Features/vllm-base-pipeline #138

Merged
merged 32 commits into from
Mar 18, 2024

Changes from 1 commit

32 commits
a6d2185
isolates llms into specific module
steffencruz Feb 28, 2024
76284e0
implement base and hf pipeline
p-ferreira Feb 28, 2024
21fa3ce
adds base llm class
p-ferreira Feb 28, 2024
b28ad5f
adds vllm model wrapper and pipeline + overall improvements
p-ferreira Feb 29, 2024
f9de780
drop-in replacement for hf zephyr
p-ferreira Feb 29, 2024
e76136e
Merge branch 'main' into features/vllm-test
p-ferreira Mar 1, 2024
76aedc2
update pipeline param type on tasks
steffencruz Mar 1, 2024
145b3cb
improving variable name, removes unused commented code
p-ferreira Mar 4, 2024
5aaccd5
fix broken tests
p-ferreira Mar 4, 2024
27c02bf
code declaration improvement on base_llm
p-ferreira Mar 4, 2024
7e035bf
unit test + overall adjustments
p-ferreira Mar 5, 2024
195aa33
runs black on new changes
p-ferreira Mar 5, 2024
ac3dee5
adds gpu calc utils functions + update vllm_llm
p-ferreira Mar 5, 2024
40aac18
update readme with vllm gpu device limitation
p-ferreira Mar 6, 2024
4f4e9ac
updates unit test
p-ferreira Mar 6, 2024
5cd27ca
upgrade load_vllm to handle max-len vs kv cache exceptional scenarios
p-ferreira Mar 6, 2024
f3245cc
runs black on vllm file
p-ferreira Mar 6, 2024
d70dc3a
refactor vllm load second attempt approach
p-ferreira Mar 6, 2024
7895c88
adds extra cuda sync before mem_get_info
p-ferreira Mar 6, 2024
b58ee03
fix imports of vllm_llm
p-ferreira Mar 6, 2024
9bc6060
adds tests for new vllm functionality
p-ferreira Mar 6, 2024
75c8a4e
Update neurons/miners/zephyr/miner.py
p-ferreira Mar 7, 2024
c8b9e28
pr adjustments: file renaming + vllm requirements
p-ferreira Mar 7, 2024
7244789
fix renaming imports
p-ferreira Mar 7, 2024
fb3eb98
rollback on requirements
p-ferreira Mar 7, 2024
1a4c69d
Merge pull request #149 from opentensor/staging
p-ferreira Mar 11, 2024
3a0517b
Merge branch 'main' into features/vllm-test
p-ferreira Mar 12, 2024
f5e241a
fix merging issues
p-ferreira Mar 12, 2024
31dbfb9
adjust cuda mock on unit test
p-ferreira Mar 13, 2024
1cb5a1e
Merge branch 'features/vllm-test' of https://github.com/opentensor/pr…
p-ferreira Mar 13, 2024
12769d9
mock cuda device call on unit test
p-ferreira Mar 13, 2024
e3545ac
fix mock target on patch
p-ferreira Mar 13, 2024
unit test + overall adjustments
p-ferreira committed Mar 5, 2024
commit 7e035bf5e0894731a09e1b66159354261d081ba0
1 change: 1 addition & 0 deletions prompting/cleaners/__init__.py
@@ -0,0 +1 @@
from .cleaner import CleanerPipeline
12 changes: 8 additions & 4 deletions prompting/llms/hf_llm.py
@@ -68,14 +68,19 @@ def __init__(
self.model = model_id
self.device = device
self.torch_dtype = torch_dtype

self.mock = mock
self.pipeline = load_hf_pipeline(
model_id, device, torch_dtype, mock, model_kwargs
)
self.tokenizer = self.pipeline.tokenizer

def __call__(self, composed_prompt: str, **kwargs: dict) -> str:
return self.pipeline(composed_prompt, **kwargs)
if self.mock:
return self.pipeline(composed_prompt, **kwargs)

# Extract the generated text from the pipeline output
outputs = self.pipeline(composed_prompt, **kwargs)
return outputs[0]["generated_text"]


class HuggingFaceLLM(BaseLLM):
@@ -135,10 +140,9 @@ def _make_prompt(self, messages: List[Dict[str, str]]):
def forward(self, messages: List[Dict[str, str]]):
composed_prompt = self._make_prompt(messages)
# System prompt is composed in the prompt
outputs = self.llm_pipeline(
response = self.llm_pipeline(
composed_prompt=composed_prompt, **self.model_kwargs
)
response = outputs[0]["generated_text"]

response = response.replace(composed_prompt, "").strip()

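The hunks above make the Hugging Face pipeline wrapper return a plain string in both mock and real modes, which is why `HuggingFaceLLM.forward` no longer indexes into `outputs[0]["generated_text"]` itself. A minimal, runnable sketch of that pattern; the names `_fake_hf_pipeline` and `call_pipeline` are illustrative stand-ins, not identifiers from the repository:

```python
from typing import Any, Dict, List


def _fake_hf_pipeline(prompt: str, **kwargs: Any) -> List[Dict[str, str]]:
    # Stand-in for a transformers text-generation pipeline: it returns a list
    # of dicts, each carrying the prompt plus the generated continuation.
    return [{"generated_text": prompt + " Austin."}]


def call_pipeline(pipeline, composed_prompt: str, mock: bool, **kwargs: Any) -> str:
    """Mirrors the mock-aware __call__ added in the diff above."""
    if mock:
        # The mock pipeline already returns a plain string.
        return pipeline(composed_prompt, **kwargs)
    # A real text-generation pipeline returns [{"generated_text": ...}],
    # so the string is extracted before returning.
    outputs = pipeline(composed_prompt, **kwargs)
    return outputs[0]["generated_text"]


print(call_pipeline(_fake_hf_pipeline, "What is the capital of Texas?", mock=False))
```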
7 changes: 4 additions & 3 deletions prompting/llms/vllm_llm.py
@@ -20,11 +20,12 @@
from vllm import LLM, SamplingParams
from prompting.cleaners.cleaner import CleanerPipeline
from prompting.llms import BasePipeline, BaseLLM
from prompting.mock import MockPipeline

def load_vllm_pipeline(model_id, mock=False):
"""Loads the VLLM pipeline for the LLM, or a mock pipeline if mock=True"""
if mock or model_id == "mock":
return None
return MockPipeline(model_id)

return LLM(model=model_id)

@@ -37,7 +38,7 @@ def __init__(self, model_id, device=None, mock=False):

def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str:
if self.mock:
return composed_prompt
return self.llm(composed_prompt, **model_kwargs)

# Compose sampling params
temperature = model_kwargs.get("temperature", 0.8)
@@ -120,7 +121,7 @@ def forward(self, messages: List[Dict[str, str]]):

if __name__ == "__main__":
# Example usage
llm_pipeline = vLLMPipeline(model_id="HuggingFaceH4/zephyr-7b-beta", mock=False)
llm_pipeline = vLLMPipeline(model_id="HuggingFaceH4/zephyr-7b-beta", mock=True)
llm = vLLM_LLM(llm_pipeline, system_prompt="You are a helpful AI assistant")

message = "What is the capital of Texas?"
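With this change, `load_vllm_pipeline` now hands back a `MockPipeline` (instead of `None`) when `mock=True`, so unit tests never need a GPU or the vllm package, while the real path composes `SamplingParams` from loosely typed kwargs. A hedged sketch of that shape, assuming standard vLLM APIs (`LLM`, `SamplingParams`, `generate`); only the temperature default of 0.8 appears in the diff, the other defaults here are illustrative:

```python
from typing import Any


class FakeVLLMPipeline:
    """Stand-in for prompting.mock.MockPipeline: always returns a canned reply."""

    def __init__(self, phrase: str):
        self.phrase = phrase

    def __call__(self, composed_prompt: str, **kwargs: Any) -> str:
        return self.phrase


def load_pipeline(model_id: str, mock: bool = False):
    """Mirrors load_vllm_pipeline: return a mock unless a real model is requested."""
    if mock or model_id == "mock":
        return FakeVLLMPipeline(model_id)
    # Real path: requires the vllm package and a GPU, so tests never take it.
    from vllm import LLM
    return LLM(model=model_id)


def generate(pipeline, composed_prompt: str, mock: bool, **model_kwargs: Any) -> str:
    if mock:
        return pipeline(composed_prompt, **model_kwargs)
    # Compose sampling params from kwargs, as in the diff above.
    from vllm import SamplingParams
    params = SamplingParams(
        temperature=model_kwargs.get("temperature", 0.8),
        top_p=model_kwargs.get("top_p", 0.95),
        max_tokens=model_kwargs.get("max_tokens", 256),
    )
    return pipeline.generate(composed_prompt, params)[0].outputs[0].text


print(generate(load_pipeline("mock", mock=True), "What is the capital of Texas?", mock=True))
```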
4 changes: 2 additions & 2 deletions prompting/mock.py
@@ -56,8 +56,8 @@ def __init__(
def __repr__(self):
return f"{self.__class__.__name__}(phrase={self.model.phrase})"

def __call__(self, messages, **kwargs):
return self.forward(messages, **kwargs)
def __call__(self, composed_prompt, **kwargs):
return self.forward(composed_prompt, **kwargs)

def forward(self, messages, **kwargs):
output = self.model(messages)
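Renaming the `MockPipeline.__call__` parameter from `messages` to `composed_prompt` keeps the mock call-compatible with the real pipelines, since the LLM wrappers pass the prompt by keyword (`self.llm_pipeline(composed_prompt=...)`). A tiny illustration of why the parameter name matters; both classes below are throwaway examples, not repository code:

```python
class RealishPipeline:
    def __call__(self, composed_prompt: str, **kwargs) -> str:
        return composed_prompt + " -> generated"


class MockishPipeline:
    # Before this commit the parameter was named `messages`, so calling it
    # with composed_prompt=... as the wrappers do would raise a TypeError.
    def __call__(self, composed_prompt: str, **kwargs) -> str:
        return "mock reply"


for pipe in (RealishPipeline(), MockishPipeline()):
    # The wrappers pass the prompt by keyword, so the name must match.
    print(pipe(composed_prompt="What is the capital of Texas?"))
```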
7 changes: 7 additions & 0 deletions tests/fixtures/cleaner.py
@@ -0,0 +1,7 @@
from prompting.cleaners import CleanerPipeline

DEFAULT_CLEANER_PIPELINE = CleanerPipeline([
dict(name="remove_quotes"),
dict(name="prune_ending"),
dict(name="remove_roles"),
])
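The new `DEFAULT_CLEANER_PIPELINE` fixture chains the remove_quotes, prune_ending and remove_roles cleaners. A usage sketch mirroring tests/test_llm.py below, assuming the tests package is importable from the repository root:

```python
from tests.fixtures.llm import llms
from tests.fixtures.cleaner import DEFAULT_CLEANER_PIPELINE

raw = '"I am a quote. User: I know you are. I am asking a question. What is th"'

for llm in llms():
    cleaned = llm.clean_response(cleaner=DEFAULT_CLEANER_PIPELINE, response=raw)
    # Expected per tests/test_llm.py:
    # 'I am a quote. I know you are. I am asking a question.'
    print(cleaned)
```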
19 changes: 18 additions & 1 deletion tests/fixtures/llm.py
@@ -1,3 +1,20 @@
from prompting.mock import MockPipeline
from prompting.llms import vLLM_LLM, HuggingFaceLLM, HuggingFacePipeline, vLLMPipeline

LLM_PIPELINE = MockPipeline("This is just another test.")
def mock_llm_pipeline():
return MockPipeline("This is just another test.")

def llms():
pipeline = MockPipeline("This is just another test.")
llms = [
vLLM_LLM(pipeline, ''),
HuggingFaceLLM(pipeline, '')
]
return llms

def pipelines():
# Return pipeline types to be instantiated downstream
return [
HuggingFacePipeline,
vLLMPipeline
]
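The module-level `LLM_PIPELINE` constant is replaced by `mock_llm_pipeline()`, presumably so that every parametrized test builds a fresh mock pipeline instead of sharing one instance across the session. A short sketch of that design choice, using only names that appear in the diff:

```python
from prompting.mock import MockPipeline

# Old pattern: one shared instance for every test in the session.
LLM_PIPELINE = MockPipeline("This is just another test.")

# New pattern: a factory, so each call (and therefore each test) gets a
# fresh, isolated pipeline.
def mock_llm_pipeline() -> MockPipeline:
    return MockPipeline("This is just another test.")

assert mock_llm_pipeline() is not mock_llm_pipeline()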
46 changes: 23 additions & 23 deletions tests/test_agent.py
@@ -2,7 +2,7 @@
from prompting.tasks import Task
from prompting.agent import HumanAgent, create_persona

from .fixtures.llm import LLM_PIPELINE
from .fixtures.llm import mock_llm_pipeline
from .fixtures.task import CONTEXTS, TASKS

"""
@@ -33,55 +33,55 @@
@pytest.mark.parametrize('task', TASKS)
def test_agent_creation_with_dataset_context(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent is not None

@pytest.mark.parametrize('task', TASKS)
def test_agent_contains_persona(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent.persona is not None

@pytest.mark.parametrize('task', TASKS)
def test_user_can_set_agent_persona(task: Task):
context = CONTEXTS[task]
persona = create_persona()
task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True, persona=persona)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True, persona=persona)
assert agent.persona == persona

@pytest.mark.parametrize('task', TASKS)
def test_agent_contains_task(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent.task is not None

@pytest.mark.parametrize('task', TASKS)
def test_agent_has_system_prompt(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent.system_prompt is not None

@pytest.mark.parametrize('task', TASKS)
def test_user_can_set_agent_system_prompt_template(task: Task):
context = CONTEXTS[task]
system_template = "Today I am in a {mood} mood because i wanted {desc} related to {topic} ({subtopic}) in a {tone} tone. My intention is {goal}, but my problem is {query}"

task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True, system_template=system_template)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True, system_template=system_template)
assert agent.system_prompt_template


@pytest.mark.parametrize('task', TASKS)
@pytest.mark.parametrize('begin_conversation', [True, False])
def test_agent_can_make_challenges(task: Task, begin_conversation: bool):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=begin_conversation)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=begin_conversation)
if begin_conversation:
assert agent.challenge is not None
else:
@@ -90,30 +90,30 @@ def test_agent_can_make_challenges(task: Task, begin_conversation: bool):
@pytest.mark.parametrize('task', TASKS)
def test_agent_progress_is_zero_on_init(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent.progress == 0

@pytest.mark.parametrize('task', TASKS)
def test_agent_progress_is_one_when_task_is_complete(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
task.complete = True
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent.progress == 1

@pytest.mark.parametrize('task', TASKS)
def test_agent_finished_is_true_when_task_is_complete(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
task.complete = True
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent.finished == True

@pytest.mark.parametrize('task', TASKS)
def test_agent_finished_is_false_when_task_is_not_complete(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
task.complete = False
agent = HumanAgent(llm_pipeline=LLM_PIPELINE, task=task, begin_conversation=True)
agent = HumanAgent(llm_pipeline=mock_llm_pipeline(), task=task, begin_conversation=True)
assert agent.finished == False
8 changes: 4 additions & 4 deletions tests/test_dataset_task_integration.py
@@ -1,6 +1,6 @@
import pytest
from prompting.tasks import Task
from .fixtures.llm import LLM_PIPELINE
from .fixtures.llm import mock_llm_pipeline
from .fixtures.task import CONTEXTS, TASKS


@@ -15,18 +15,18 @@
@pytest.mark.parametrize('task', TASKS)
def test_task_creation_with_dataset_context(task: Task):
context = CONTEXTS[task]
task(llm_pipeline=LLM_PIPELINE, context=context)
task(llm_pipeline=mock_llm_pipeline(), context=context)
assert task is not None

@pytest.mark.parametrize('task', TASKS)
def test_task_contains_query(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
assert task.query is not None

@pytest.mark.parametrize('task', TASKS)
def test_task_contains_reference(task: Task):
context = CONTEXTS[task]
task = task(llm_pipeline=LLM_PIPELINE, context=context)
task = task(llm_pipeline=mock_llm_pipeline(), context=context)
assert task.reference is not None

59 changes: 59 additions & 0 deletions tests/test_llm.py
@@ -0,0 +1,59 @@
# llm, input, expected output, cleaner

##test_llm_forward
# test llm query (check messages, times)
# test llm query (calls forward, clean_response)

import pytest
from prompting.llms import BaseLLM, BasePipeline
from prompting.cleaners import CleanerPipeline
from prompting.mock import MockPipeline
from .fixtures.llm import llms, pipelines
from .fixtures.cleaner import DEFAULT_CLEANER_PIPELINE

@pytest.mark.parametrize('input, expected_result, cleaner',
[('"I am a quote. User: I know you are. I am asking a question. What is th"', '"I am a quote. User: I know you are. I am asking a question. What is th"', None),
('"I am a quote. User: I know you are. I am asking a question. What is th"', "I am a quote. I know you are. I am asking a question.", DEFAULT_CLEANER_PIPELINE)]
)
@pytest.mark.parametrize('llm', llms())
def test_llm_clean_response(input: str, expected_result: str, cleaner: CleanerPipeline, llm: BaseLLM):
result = llm.clean_response(cleaner=cleaner, response=input)
assert result == expected_result


@pytest.mark.parametrize('pipeline', pipelines())
def test_load_pipeline_mock(pipeline: BasePipeline):
# Note that the model_id will be used internally as static response for the mock pipeline
model_id = "gpt2"
pipeline_instance = pipeline(model_id=model_id, device='cpu', mock=True)
pipeline_message = pipeline_instance('')

mock_message = MockPipeline(model_id).forward(messages=[])
assert mock_message == pipeline_message


@pytest.mark.parametrize('llm', llms())
def test_llm_query(llm: BaseLLM):
message = 'test'
llm.query(message)

# Assert that stateful operation where 3 messages are saved:
# the system prompt (on llm init), the user message and the assistant reply
assert len(llm.messages) == 3
assert len(llm.times) == 3

assert llm.messages[0]['role'] == 'system'

assert llm.messages[1]['role'] == 'user'
assert llm.messages[1]['content'] == message

assert llm.messages[2]['role'] == 'assistant'

@pytest.mark.parametrize('llm', llms())
def test_llm_forward(llm: BaseLLM):
llm.forward(llm.messages)

# Assert stateless operation of the model with only history of system prompt
assert len(llm.messages) == 1
assert len(llm.times) == 1
assert llm.messages[0]['role'] == 'system'
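The assertions above encode the BaseLLM contract: `query` is stateful (it records the system prompt, the user turn and the assistant reply, plus a timing entry per message), while `forward` generates from a given history without mutating it. A toy model of that contract, not the repository implementation:

```python
import time
from typing import Dict, List


class TinyLLM:
    """Toy model of the query/forward behaviour that tests/test_llm.py checks."""

    def __init__(self, system_prompt: str):
        self.messages: List[Dict[str, str]] = [{"role": "system", "content": system_prompt}]
        self.times: List[float] = [0.0]

    def forward(self, messages: List[Dict[str, str]]) -> str:
        # Stateless: generate from the given history without appending to it.
        return "stub response"

    def query(self, message: str) -> str:
        # Stateful: record the user turn, generate, record the assistant turn.
        t0 = time.time()
        self.messages.append({"role": "user", "content": message})
        self.times.append(0.0)
        response = self.forward(self.messages)
        self.messages.append({"role": "assistant", "content": response})
        self.times.append(time.time() - t0)
        return response


llm = TinyLLM("You are a helpful AI assistant")
llm.query("test")
assert len(llm.messages) == 3 and llm.messages[2]["role"] == "assistant"
```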