examples: RAG Question-answering Web Service #3098

Merged Feb 13, 2024 · 12 commits
18 changes: 18 additions & 0 deletions support/rag/qa-webservice/README.md
@@ -0,0 +1,18 @@
## RAG Question-answering Web Service

This script allows for the quick deployment of a local web service that answers questions based on
context retrieved from a configured vector database.
To run it, first install the required dependencies:
```
pip install -r reqs.txt
```

Then start the web service:
```
uvicorn private_ai_api:app --reload
```

You can now query the web service with the following command:
```
curl http://127.0.0.1:8000/question/ -H "Content-Type: application/json" -d '{"question": "INPUT-QUESTION-HERE"}'
```
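
Equivalently, the endpoint can be called from Python. This is a minimal sketch, assuming the service runs on uvicorn's default port 8000 and using a placeholder question (requires the `requests` package):
```
import requests

# POST a question to the locally running service; the body must be a JSON
# object with a "question" field, matching the service's QuestionModel schema.
resp = requests.post(
    "http://127.0.0.1:8000/question/",
    json={"question": "What is a data job?"},
)
resp.raise_for_status()
print(resp.json())  # the answer is returned as a JSON-encoded string
```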
11 changes: 11 additions & 0 deletions support/rag/qa-webservice/api_config.ini
@@ -0,0 +1,11 @@
[db]
postgres_dbname=
postgres_dsn=
postgres_host=
postgres_password=
postgres_user=

[llm]
auth_token=
llm_host=
llm_model=meta-llama/Llama-2-13b-chat-hf
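
For orientation, here is a minimal sketch of how the service reads these settings at request time (mirroring private_ai_api.py; the actual values are deployment-specific, which is why they are left blank above):
```
import configparser

# Read the same keys that private_ai_api.py looks up on each request.
config = configparser.ConfigParser()
config.read("api_config.ini")

db_settings = dict(config["db"])        # postgres_dsn, postgres_dbname, postgres_user, ...
llm_host = config["llm"]["llm_host"]    # base URL of an OpenAI-compatible completions endpoint
llm_model = config["llm"]["llm_model"]  # ships pre-filled as meta-llama/Llama-2-13b-chat-hf
```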
109 changes: 109 additions & 0 deletions support/rag/qa-webservice/private_ai_api.py
@@ -0,0 +1,109 @@
from fastapi import FastAPI
from pydantic import BaseModel
import os
import nltk
import psycopg2
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import configparser
from pgvector.psycopg2 import register_vector
import re
from pathlib import Path
from openai import OpenAI

# TODO: figure out how to make the parts configurable, i.e. embedding model could be configured here
# but it would also need to be the same at the document ingestion step so that the similarity search
# can work
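# One possible shape for this (a sketch only; the "embedding_model" key is
# hypothetical and not read anywhere yet):
#     config.get("llm", "embedding_model", fallback="all-mpnet-base-v2")
# The ingestion job would have to read the same key so that both sides embed
# with the same model.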


def clean_text(text):
    """
    TODO: Copied from the embed-ingest-job-example. Needs to be replaced by a more robust approach,
    something off the shelf ideally.
    """
    text = text.lower()
    # remove punctuation and special characters
    text = re.sub(r"[^\w\s]", "", text)
    # remove stopwords and lemmatize
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    text = " ".join(
        [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    )
    return text
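

# Illustrative usage (exact output depends on NLTK's stopword list and the
# default noun lemmatization):
#     clean_text("What are the Data Jobs?")  # -> "data job"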


def setup_nltk(temp_dir):
    """
    Set up NLTK by creating a temporary directory for NLTK data and downloading required resources.
    """
    nltk_data_path = Path(temp_dir) / "nltk_data"
    nltk.data.path.append(str(nltk_data_path))

    # Skip the downloads if the data directory already exists from a previous run.
    if os.path.isdir(nltk_data_path):
        return
    nltk_data_path.mkdir(parents=True)

    nltk.download("stopwords", download_dir=str(nltk_data_path))
    nltk.download("wordnet", download_dir=str(nltk_data_path))


class QuestionModel(BaseModel):
    question: str


app = FastAPI()


@app.post("/question/")
async def answer_question(question: QuestionModel):
setup_nltk(".")

config = configparser.ConfigParser()
config.read("api_config.ini")

# Embed the question

Check notice on line 68 in support/rag/qa-webservice/private_ai_api.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

support/rag/qa-webservice/private_ai_api.py#L68

Trailing whitespace
model = SentenceTransformer("all-mpnet-base-v2")
embedding = model.encode(clean_text(question.question), show_progress_bar=True)

# DB connection
db_conn = psycopg2.connect(
dsn=config["db"]["postgres_dsn"],
dbname=config["db"]["postgres_dbname"],
user=config["db"]["postgres_user"],
password=config["db"]["postgres_password"],
host=config["db"]["postgres_host"]
)
register_vector(db_conn)
cur = db_conn.cursor()

# Similarity search
cur.execute('SELECT vdk_confluence_doc_metadata_example_2.data FROM vdk_confluence_doc_metadata_example_2 JOIN vdk_confluence_doc_embeddings_example_2 ON vdk_confluence_doc_metadata_example_2.id = vdk_confluence_doc_embeddings_example_2.id ORDER BY vdk_confluence_doc_embeddings_example_2.embedding <-> %s LIMIT 3', (embedding,))
res = cur.fetchall()

docs = "\n".join([doc[0] for doc in res])

# Build prompt
prompt = f"""Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.

Check notice on line 90 in support/rag/qa-webservice/private_ai_api.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

support/rag/qa-webservice/private_ai_api.py#L90

Trailing whitespace
Context: {docs}
Question: {question.question}
Helpful Answer:"""

# Standard formatting for LLaMa 2
prompt = f"<s>[INST] <<SYS>>\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.\n<</SYS>>\n\n{prompt} [/INST] "

client = OpenAI(
api_key = config['llm']['auth_token'],
base_url = config['llm']['llm_host']
)

Check notice on line 102 in support/rag/qa-webservice/private_ai_api.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

support/rag/qa-webservice/private_ai_api.py#L102

Trailing whitespace
completion = client.completions.create(model=config['llm']['llm_model'], prompt=prompt, max_tokens=512, temperature=0, stream=True)

model_output = ""
for c in completion:
model_output += c.choices[0].text

return model_output
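
As a quick smoke test, the route can also be driven in-process with FastAPI's TestClient. This is a sketch, assuming api_config.ini points at a reachable Postgres instance (with pgvector) and LLM endpoint, and using a placeholder question:
```
from fastapi.testclient import TestClient

from private_ai_api import app

# Exercises POST /question/ without starting a uvicorn server.
client = TestClient(app)
response = client.post("/question/", json={"question": "What is VDK?"})
print(response.status_code, response.json())
```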
7 changes: 7 additions & 0 deletions support/rag/qa-webservice/reqs.txt
@@ -0,0 +1,7 @@
pgvector
langchain
uvicorn[standard]
fastapi
nltk
sentence-transformers
openai
psycopg2-binary  # needed for the psycopg2 import in private_ai_api.py