
Commit

clean up
yoomlam committed Apr 30, 2024
1 parent 9ffc282 commit e6c28f9
Showing 4 changed files with 119 additions and 88 deletions.
95 changes: 58 additions & 37 deletions 02-household-queries/decompose-questions.py
@@ -3,20 +3,22 @@
##
# Use specified LLM to decompose a set of user questions into
# derived/decomposed questions, which will be used to retrieve Guru cards.
# Evaluate the retrieval performance.
# Also evaluates the Guru card retrieval performance.

# import time
import os
import json
import csv
import sys
import traceback
import dotenv

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

import dspy
import dspy_engine

# from retrieval import create_retriever
# print("Loading our libraries...")
import dspy_engine
import ingest
import debugging

@@ -71,8 +73,6 @@ def cache_derived_questions(llm_model, predictor):
add_transformation(indexed_qs, qa_dict["id"], question, llm_model, derived_questions)
except Exception as e:
print(" => Error:", e)
import traceback

traceback.print_exc()
# dspy_engine.print_last_llm_history()
break
@@ -90,10 +90,6 @@ def create_predictor(llm_choice):
)
print("LLM model created", dspy.settings.lm)

system_prompt = 'Decompose into multiple questions so that we can search for relevant SNAP and food assistance eligibility rules. Be concise -- only respond with JSON. Only output the questions as a JSON list: ["question1", "question2", ...]'

transform_prompt = system_prompt + "The question is: {question}"

class DecomposeQuestion(dspy.Signature):
"""Decompose into multiple questions so that we can search for relevant SNAP and food assistance eligibility rules. \
Be concise -- only respond with JSON. Only output the questions as a JSON list: ["question1", "question2", ...]. \
@@ -137,7 +133,6 @@ def eval_retrieval(llm_model, qa, derived_qs, vectordb, retrieve_k=5):
for qa_dict in qa:
question = qa_dict["orig_question"]
guru_cards = qa_dict.get("guru_cards", [])
# debugging.debug_here(locals())
if not narrowed_qs[question]:
print(f"Derived questions not found -- Skipping {question}")
continue
@@ -188,7 +183,7 @@ def eval_retrieval(llm_model, qa, derived_qs, vectordb, retrieve_k=5):
return eval_results


def main0():
def main0_ingest_guru_cards():
def ingest_call(
vectordb,
embedding_name=None,
@@ -217,49 +212,75 @@ def ingest_call(
ingest_call(vectordb=vectordb)


def main1():
llm_model = _llm_model_name
def main1_decompose_user_questions():
llm_model = os.environ.get("LLM_MODEL_NAME", "openhermes")
print(f"LLM_MODEL_NAME: {llm_model}")
predictor = create_predictor(llm_model)
print("Predictor created", predictor)
cache_derived_questions(llm_model, predictor)


def save_summary_csv(filename, eval_results):
with open(filename, "w", encoding="utf-8") as file:
result_fields = ["id", "derived_questions_count", "recall", "extra_cards", "retrieved_cards_count"]
writer = csv.DictWriter(file, fieldnames=result_fields, extrasaction="ignore")
writer.writeheader()

for r in eval_results:
r["derived_questions_count"] = len(r["derived_questions"])
r["retrieved_cards_count"] = len(r["all_retrieved_cards"])
writer.writerow(r)


def main2():
llm_model = _llm_model_name
def main2_evaluate_retrieval():
derived_qs = load_derived_questions_cache()

list_models(derived_qs)
vectordb = create_vectordb()
# retriever = create_retriever(vectordb)

qa = load_user_questions()

llm_model = os.environ.get("LLM_MODEL_NAME", "openhermes")
print(f"LLM_MODEL_NAME: {llm_model}")
retrieve_k = int(os.environ.get("RETRIEVE_K", "4"))
print("RETRIEVE_K:", retrieve_k)
eval_results = eval_retrieval(llm_model, qa, derived_qs, vectordb, retrieve_k)

# timestamp_str = time.strftime('%Y-%m-%d-%H%M%S')
with open(f"qt-retrieval-eval_results-{llm_model}-k_{retrieve_k}.json", "w", encoding="utf-8") as f:
json.dump(eval_results, f, indent=4)

save_summary_csv(f"qt-retrieval-eval_results-{llm_model}-k_{retrieve_k}.csv", eval_results)
print("\nResult summary:")
print("\nResult summary: (id, recall, extra_cards, retrieved_cards_count)")
for r in eval_results:
print(r["id"], r["recall"], r["extra_cards"], len(r["all_retrieved_cards"]))


print("Running...")
_llm_model_name = os.environ.get("LLM_MODEL_NAME", "openhermes")
main2()
def save_summary_csv(filename, eval_results):
with open(filename, "w", encoding="utf-8") as file:
result_fields = ["id", "derived_questions_count", "recall", "extra_cards", "retrieved_cards_count"]
writer = csv.DictWriter(file, fieldnames=result_fields, extrasaction="ignore")
writer.writeheader()

for r in eval_results:
r["derived_questions_count"] = len(r["derived_questions"])
r["retrieved_cards_count"] = len(r["all_retrieved_cards"])
writer.writerow(r)


# RETRIEVE_K=4 for LLM_MODEL_NAME in mistral:instruct gpt-3.5-turbo gemini-1.0-pro llama3-70b-8192; do echo $LLM_MODEL_NAME; ./decompose-questions.py > "qt-retrieval-eval-$LLM_MODEL_NAME-k_$RETRIEVE_K.log"; echo .; done
def list_models(derived_qs):
models = set()
for item in derived_qs:
models.update(item["transformations"].keys())
cached_models = sorted(models)
print("Available LLM models in derived questions cache:", " ".join(f"'{m}'" for m in cached_models))
return cached_models


if __name__ == "__main__":
print("""
0. ingest Guru cards into vector DB
1. cache decomposed/derived questions
2. evaluate Guru card retrieval""")
dotenv.load_dotenv()
if args := sys.argv[1:]:
choice = args[0]
print("Running option:", choice)
else:
print("What would you like to do?")
choice = input()

if choice in ["0", "ingest"]:
main0_ingest_guru_cards()
elif choice in ["1", "decompose"]:
main1_decompose_user_questions()
elif choice in ["2", "evaluate"]:
main2_evaluate_retrieval()

# Copy and paste to evaluate several models on the command line:
# RETRIEVE_K=4 for LLM_MODEL_NAME in mistral:instruct ...; do ./decompose-questions.py evaluate > "qt-retrieval-eval-$LLM_MODEL_NAME-k_$RETRIEVE_K.log"; echo .; done
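For orientation (outside the diff itself): a minimal sketch of how a dspy Signature like DecomposeQuestion above is typically wrapped into a callable predictor and its JSON output parsed. The field names and the Predict wiring below are assumptions, since create_predictor's body is collapsed in this view.

    import json
    import dspy

    class DecomposeQuestionSketch(dspy.Signature):
        """Decompose a user question into search-friendly sub-questions.
        Only output the questions as a JSON list: ["question1", "question2", ...]."""

        question = dspy.InputField()
        answer = dspy.OutputField(desc="JSON list of derived questions")

    # dspy.Predict turns the signature into a callable LLM program.
    predictor = dspy.Predict(DecomposeQuestionSketch)
    # Requires an LM to be configured first, e.g. dspy.settings.configure(lm=...):
    # derived_questions = json.loads(predictor(question="Do my roommates count?").answer)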
15 changes: 6 additions & 9 deletions 02-household-queries/dspy_engine.py
@@ -17,9 +17,6 @@
import debugging


dotenv.load_dotenv()


class BasicQA(dspy.Signature):
"""Answer questions with short answers."""

@@ -83,13 +80,12 @@ class RAG(dspy.Module):
def __init__(self, num_passages):
super().__init__()
self.retrieve = dspy.Retrieve(k=num_passages)
signature = GenerateAnswer
self.generate_answer = dspy.ChainOfThought(
signature,
rationale_type=dspy.OutputField(
prefix="Reasoning: Let's think step by step in order to",
desc="${produce the " + "answer" + "}. We ...",
),
GenerateAnswer,
# rationale_type=dspy.OutputField(
# prefix="Reasoning: Let's think step by step in order to",
# desc="${produce the " + "answer" + "}. We ...",
# ),
)

def forward(self, question):
@@ -409,6 +405,7 @@ def examples_from(qa):


if __name__ == "__main__":
dotenv.load_dotenv()
examples_qa = examples_from(load_training_json())

# main_baseline(examples_qa[0].question)
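Related note (not part of the diff): the dspy_engine.py change above defers dotenv.load_dotenv() from import time to the __main__ block, so importing dspy_engine from other scripts no longer has environment side effects. A minimal sketch of that pattern, with assumed names:

    import os
    import dotenv

    def run():
        # Configuration is read only after the entry point has loaded .env.
        model = os.environ.get("LLM_MODEL_NAME", "openhermes")
        print("Using model:", model)

    if __name__ == "__main__":
        dotenv.load_dotenv()  # load .env only when executed as a script, not on import
        run()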
91 changes: 52 additions & 39 deletions 02-household-queries/ingest.py
@@ -1,7 +1,9 @@
from bs4 import BeautifulSoup
import os
import dotenv
import json
import dotenv

from bs4 import BeautifulSoup

from langchain_community.document_loaders import PDFMinerLoader
from langchain.docstore.document import Document
from langchain_text_splitters import (
@@ -18,40 +20,51 @@

from llm import ollama_client


dotenv.load_dotenv()

# _llm_model_name = os.environ.get("LLM_MODEL_NAME", "mistral")

# llm = ollama_client(_llm_model_name, settings={"temperature": 0.1})


EMBEDDINGS = {
"all-MiniLM-L6-v2": {
"func": SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
"token_limit": 256,
},
"HuggingFace::all-MiniLM-L6-v2": {
"func": HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
"token_limit": 256,
},
# "Google::models/embedding-001": {
# "func": GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
# "token_limit": 2048,
# },
# "google_models/text-embedding-004": {
# "func": GoogleGenerativeAIEmbeddings(model="models/text-embedding-004"),
# "token_limit": 768,
# },
"BAAI/bge-small-en-v1.5": {
"func": SentenceTransformerEmbeddings(model_name="BAAI/bge-small-en-v1.5"),
"token_limit": 512,
},
"mixedbread-ai/mxbai-embed-large-v1": {
"func": SentenceTransformerEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1"),
"token_limit": 1024,
},
}
_llm = None


def get_llm():
global _llm
if not _llm:
dotenv.load_dotenv()
_llm_model_name = os.environ.get("LLM_MODEL_NAME", "mistral")
_llm = ollama_client(_llm_model_name, settings={"temperature": 0.1})
return _llm


_embeddings = None


def get_embeddings():
global _embeddings
if not _embeddings:
_embeddings = {
"all-MiniLM-L6-v2": {
"func": SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
"token_limit": 256,
},
"HuggingFace::all-MiniLM-L6-v2": {
"func": HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
"token_limit": 256,
},
# "Google::models/embedding-001": {
# "func": GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
# "token_limit": 2048,
# },
# "google_models/text-embedding-004": {
# "func": GoogleGenerativeAIEmbeddings(model="models/text-embedding-004"),
# "token_limit": 768,
# },
"BAAI/bge-small-en-v1.5": {
"func": SentenceTransformerEmbeddings(model_name="BAAI/bge-small-en-v1.5"),
"token_limit": 512,
},
"mixedbread-ai/mxbai-embed-large-v1": {
"func": SentenceTransformerEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1"),
"token_limit": 1024,
},
}
return _embeddings


# split text into chunks
@@ -69,7 +82,7 @@ def get_text_chunks_langchain(text, source, chunk_size, chunk_overlap, token_lim
if not silent:
print(" Split into", len(texts))
for t in texts:
token_count = llm.get_num_tokens(t)
token_count = get_llm().get_num_tokens(t)
if token_count > token_limit:
print(f"Exceeded token limit of {token_limit}: {token_count};")

@@ -87,7 +100,7 @@ def get_text_chunks_langchain(text, source, chunk_size, chunk_overlap, token_lim
# Chunk the pdf and load into vector db
def add_pdf_to_vector_db(vectordb, file_path, embedding_name=None, chunk_size=500, chunk_overlap=100):
if embedding_name:
check_embedding(chunk_size, EMBEDDINGS.get(embedding_name, ""))
check_embedding(chunk_size, get_embeddings().get(embedding_name, ""))
# PDFMinerLoader only gives metadata when extract_images=True due to default using lazy_loader
loader = PDFMinerLoader(file_path, extract_images=True)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
@@ -112,7 +125,7 @@ def add_json_html_data_to_vector_db(
data_file = open(file_path, encoding="utf-8")
json_data = json.load(data_file)
if embedding_name:
check_embedding(chunk_size, EMBEDDINGS.get(embedding_name, ""))
check_embedding(chunk_size, get_embeddings().get(embedding_name, ""))
for content in json_data:
if not content[index_key].strip().endswith("?"):
continue
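Usage sketch for the new lazy getters in ingest.py (illustrative assumptions about a call site, not code from this commit): importing the module no longer builds embedding models or the Ollama client; both are constructed on first use and cached.

    import ingest

    # Embeddings are keyed by name; each entry carries the model and its token limit.
    emb = ingest.get_embeddings()["all-MiniLM-L6-v2"]
    vector = emb["func"].embed_query("Who counts as a SNAP household member?")
    print("token limit:", emb["token_limit"], "dimensions:", len(vector))

    # The LLM client is likewise created lazily; get_num_tokens mirrors its use in get_text_chunks_langchain.
    print("tokens:", ingest.get_llm().get_num_tokens("some chunk of text"))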
6 changes: 3 additions & 3 deletions 02-household-queries/question-transformations.json
@@ -82,19 +82,19 @@
"q_id": 3,
"question": "The client rents a room in a house. He said he wants to apply on his own, but then he admitted he shares a kitchen with his roommates and they often cook and eat together. Do the roommates need to be included?",
"transformations": {
"OpenAI/gpt-3.5-turbo": [
"_manual-OpenAI/gpt-3.5-turbo": [
"Is shared kitchen use with roommates a factor in determining SNAP eligibility for an individual renting a room in a house?",
"Are individuals who share a kitchen with the applicant considered part of the household for SNAP eligibility purposes?",
"Does the frequency of cooking and eating together impact SNAP eligibility?",
"Does the rental arrangement (renting a room in a house) affect SNAP eligibility differently compared to renting an entire apartment or house?",
"Are there any specific criteria or guidelines for determining SNAP eligibility when multiple individuals share a kitchen but have separate living arrangements within the same house?",
"How does SNAP eligibility treat situations where roommates contribute to shared expenses such as groceries or utilities?"
],
"Google/gemini": [
"_manual-Google/gemini": [
"Does a person who rents a room in a house qualify as a separate SNAP household?",
"If a SNAP applicant shares a kitchen with roommates and occasionally cooks and eats meals together, does this require including the roommates in the SNAP application?"
],
"Anthopic/claude": [
"_manual-Anthopic/claude": [
"Do individuals who rent a room in a house and share a kitchen with roommates typically qualify as a separate household for SNAP purposes?",
"Does cooking and eating together with roommates affect an individual's eligibility to apply for SNAP benefits separately?",
"Under what circumstances are roommates required to be included in a single SNAP household?",