diff --git a/.gitignore b/.gitignore
index 197961be..150e25f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,4 +53,7 @@ website/*.tsbuildinfo
 website/next-env.d.ts
 
 # Docker
-.docker/
\ No newline at end of file
+.docker/
+
+# experiments
+experiments/*.json
\ No newline at end of file
diff --git a/experiments/outputs.txt b/experiments/outputs.txt
new file mode 100644
index 00000000..8ecb3de0
--- /dev/null
+++ b/experiments/outputs.txt
@@ -0,0 +1,14 @@
+Results Table:
+                                                       Experiment Results
+╭────────────────────────────────────────────────┬───────┬────────────┬───────────┬────────┬───────┬─────────────┬──────────────╮
+│ Model                                          │ Doc % │ Approach   │ Precision │ Recall │    F1 │ Avg Runtime │ Avg Cost ($) │
+├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤
+│ azure/gpt-4o-mini                              │   10% │ structured │     0.869 │  0.872 │ 0.853 │      1.100s │      $0.0004 │
+│ azure/gpt-4o-mini                              │   10% │ tool       │     0.914 │  0.906 │ 0.891 │      0.722s │      $0.0004 │
+├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤
+│ deepseek/deepseek-chat                         │   10% │ structured │     0.878 │  0.889 │ 0.877 │      2.094s │      $0.0003 │
+│ deepseek/deepseek-chat                         │   10% │ tool       │     0.867 │  0.856 │ 0.860 │      2.212s │      $0.0003 │
+├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤
+│ lm_studio/hugging-quants/llama-3.2-3b-instruct │   10% │ structured │     0.033 │  0.022 │ 0.027 │     33.635s │      $0.0000 │
+│ lm_studio/hugging-quants/llama-3.2-3b-instruct │   10% │ tool       │     0.000 │  0.000 │ 0.000 │     70.858s │      $0.0000 │
+╰────────────────────────────────────────────────┴───────┴────────────┴───────────┴────────┴───────┴─────────────┴──────────────╯
\ No newline at end of file
diff --git a/experiments/structured_outputs.py b/experiments/structured_outputs.py
new file mode 100644
index 00000000..e57bdcb7
--- /dev/null
+++ b/experiments/structured_outputs.py
@@ -0,0 +1,375 @@
+import os
+import time
+import random
+import json
+from typing import List, Dict, Set
+from pydantic import BaseModel
+from litellm import completion
+from dotenv import load_dotenv
+import concurrent.futures
+from threading import Lock
+from rich.console import Console
+from rich.table import Table
+from rich import box
+import litellm
+
+# litellm.set_verbose=True
+
+# Load environment variables
+load_dotenv()
+
+# Constants for the experiment
+FRUITS_VEGETABLES = [
+    "apple", "banana", "carrot", "durian", "eggplant",
+    "fig", "grape", "honeydew", "iceberg lettuce", "jackfruit",
+    "kale", "lemon", "mango", "nectarine", "orange",
+    "papaya", "quince", "radish", "spinach", "tomato",
+    "apricot", "blackberry", "cucumber", "dragonfruit", "endive",
+    "fennel", "grapefruit", "horseradish", "indian gooseberry", "jicama",
+    "kohlrabi", "lime", "mushroom", "napa cabbage", "okra",
+    "pear", "quinoa", "raspberry", "squash", "turnip"
+]
+
+# Models to test
+MODELS = [
+    "azure/gpt-4o-mini",
+    "deepseek/deepseek-chat",
+    # "lm_studio/hugging-quants/llama-3.2-3b-instruct",
+    # "lm_studio/qwen2.5-7b-instruct-1m",
+]
+SYSTEM_PROMPT = (
+    "You are a helpful assistant, helping the user make sense of their data. "
+    "The dataset description is: a collection of presidential debate transcripts. "
+    "You will be performing a map operation (1 input:1 output). "
+    "The user will specify a task for you to perform on the provided data, as precisely and "
+    "exhaustively (i.e., high recall) as possible. "
+    "The result should be a structured "
+    "output that you will send back to the user, with the `send_output` function. "
+    "Do not influence your answers too much based on the `send_output` function "
+    "parameter names; just use them to send the result back to the user."
+)
+STRUCTURED_SYSTEM_PROMPT = (
+    "You are a helpful assistant, helping the user make sense of their data. "
+    "The dataset description is: a collection of presidential debate transcripts. "
+    "You will be performing a map operation (1 input:1 output). "
+    "The user will specify a task for you to perform on the provided data, as precisely and "
+    "exhaustively (i.e., high recall) as possible. The result should be a structured "
+    "output that you will send back to the user, in JSON format. Do not influence your answers "
+    "too much based on the JSON schema names. "
+    "The JSON schema is: {schema}"
+)
+
+PROMPT_TEMPLATE = (
+    "I have injected several fruit and vegetable names into this transcript. "
+    "Your task is to find and list all fruits and vegetables mentioned. "
+    "Only include items that are actually fruits or vegetables, "
+    "not metaphors or company names: {text}"
+)
+
+class FoundItems(BaseModel):
+    fruits_and_vegetables: List[str]
+
+def load_and_augment_debates(filepath: str, num_samples: int = 20, frac_doc_content: float = 0.5) -> List[Dict[str, any]]:
+    """Load debates and augment them with fruits/vegetables"""
+    with open(filepath, 'r') as f:
+        debates = json.load(f)
+
+    # Randomly sample debates if there are more than we need
+    if len(debates) > num_samples:
+        debates = random.sample(debates, num_samples)
+
+    augmented_data = []
+    for debate in debates:
+        # Get the original content
+        content = debate['content']
+
+        # Take only the first frac_doc_content of the content
+        content = content[:int(len(content) * frac_doc_content)]
+
+        words = content.split()
+        ground_truth = set()
+
+        # Insert random fruits/vegetables
+        num_insertions = random.randint(1, 3)
+        for _ in range(num_insertions):
+            item = random.choice(FRUITS_VEGETABLES)
+            insert_pos = random.randint(0, len(words))
+            words.insert(insert_pos, item)
+            ground_truth.add(item)
+
+        augmented_data.append({
+            "text": " ".join(words),
+            "inserted_items": list(ground_truth)
+        })
+
+    return augmented_data
+
+def evaluate_structured_output(model: str, text: str) -> tuple[Set[str], float, float]:
+    """Evaluate using structured output approach"""
+    start_time = time.time()
+
+    messages = [{
+        "role": "system",
+        "content": STRUCTURED_SYSTEM_PROMPT.format(schema=FoundItems.model_json_schema())
+    }, {
+        "role": "user",
+        "content": PROMPT_TEMPLATE.format(text=text)
+    }]
+
+    response = None
+    json_schema_object = {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "send_output",
+            "strict": "true",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "fruits_and_vegetables": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "required": ["fruits_and_vegetables"]
+            }
+        },
+        "temperature": 0
+    }
+    if "gpt" in model:
+        json_schema_object = FoundItems
+    if "deepseek" in model:
+        json_schema_object = {"type": "json_object"}
+
+    try:
+        response = completion(
+            model=model,
+            messages=messages,
+            response_format=json_schema_object,
+            num_retries=3,
+            temperature=1.0,
+            max_tokens=500,
+        )
+        extracted_items = set(json.loads(response.choices[0].message.content)["fruits_and_vegetables"])
+        cost = response._hidden_params["response_cost"]
+    except Exception as e:
+        print(f"Error with structured output for {model}: {e}; {response}")
+        extracted_items = set()
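+        # no parsable response from the model, so this call contributes an empty result and zero cost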
+        cost = 0.0
+
+    runtime = time.time() - start_time
+    return extracted_items, runtime, cost
+
+def evaluate_tool_calling(model: str, text: str) -> tuple[Set[str], float, float]:
+    """Evaluate using tool calling approach"""
+    start_time = time.time()
+
+    messages = [
+        {
+            "role": "system",
+            "content": SYSTEM_PROMPT
+        },
+        {
+            "role": "user",
+            "content": PROMPT_TEMPLATE.format(text=text)
+        }
+    ]
+
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "send_output",
+            "description": "Send output back to the user",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "fruits_and_vegetables": {
+                        "type": "array",
+                        "items": {"type": "string"}
+                    }
+                },
+                "required": ["fruits_and_vegetables"]
+            }
+        },
+        "additionalProperties": False
+    }]
+
+
+    try:
+        response = completion(
+            model=model,
+            messages=messages,
+            tools=tools,
+            tool_choice={"type": "function", "function": {"name": "send_output"}},
+            num_retries=3,
+            max_tokens=500,
+            temperature=1.0,
+        )
+
+        tool_calls = response.choices[0].message.tool_calls
+        if tool_calls:
+            extracted_items = set(
+                json.loads(tool_calls[0].function.arguments)["fruits_and_vegetables"]
+            )
+        else:
+            extracted_items = set()
+        cost = response._hidden_params["response_cost"]
+    except Exception as e:
+        print(f"Error with tool calling for {model}: {e}")
+        extracted_items = set()
+        cost = 0.0
+
+    runtime = time.time() - start_time
+    return extracted_items, runtime, cost
+
+def calculate_metrics(extracted: Set[str], ground_truth: Set[str]) -> Dict[str, float]:
+    """Calculate precision, recall, and F1 score"""
+    if not extracted and not ground_truth:
+        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}
+    if not extracted or not ground_truth:
+        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
+
+    true_positives = len(extracted.intersection(ground_truth))
+    precision = true_positives / len(extracted) if extracted else 0
+    recall = true_positives / len(ground_truth) if ground_truth else 0
+    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
+
+    return {"precision": precision, "recall": recall, "f1": f1}
+
+def process_document(args) -> Dict[str, any]:
+    """Process a single document with both approaches"""
+    model, doc, i, total = args
+    print(f"Processing document {i+1}/{total}")
+
+    # Test structured output
+    extracted_structured, runtime_structured, cost_structured = evaluate_structured_output(
+        model, doc["text"]
+    )
+    metrics_structured = calculate_metrics(
+        extracted_structured, set(doc["inserted_items"])
+    )
+
+    # Test tool calling
+    extracted_tool, runtime_tool, cost_tool = evaluate_tool_calling(
+        model, doc["text"]
+    )
+    metrics_tool = calculate_metrics(
+        extracted_tool, set(doc["inserted_items"])
+    )
+
+    return {
+        "structured": {
+            **metrics_structured,
+            "runtime": runtime_structured,
+            "cost": cost_structured if cost_structured else 0.0
+        },
+        "tool": {
+            **metrics_tool,
+            "runtime": runtime_tool,
+            "cost": cost_tool if cost_tool else 0.0
+        }
+    }
+
+def run_experiment(debates_file: str, num_samples: int = 20, max_workers: int = 64):
+    """Run the main experiment with parallel processing across different document fractions"""
+    fractions = [0.1]
+    results = {
+        model: {
+            fraction: {"structured": {}, "tool": {}}
+            for fraction in fractions
+        } for model in MODELS
+    }
+    results_lock = Lock()
+
+    for fraction in fractions:
+        print(f"\nTesting with document fraction: {fraction}")
+        # Load and augment real debate data with current fraction
+        documents = load_and_augment_debates(debates_file, num_samples,
+                                             fraction)
+
+        for model in MODELS:
+            print(f"Testing model: {model}")
+
+            # Prepare arguments for parallel processing
+            args_list = [(model, doc, i, len(documents))
+                         for i, doc in enumerate(documents)]
+
+            # Process documents in parallel
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                future_to_doc = {executor.submit(process_document, args): args
+                                 for args in args_list}
+
+                for future in concurrent.futures.as_completed(future_to_doc):
+                    try:
+                        doc_results = future.result()
+
+                        # Thread-safe results aggregation
+                        with results_lock:
+                            for approach in ["structured", "tool"]:
+                                for metric, value in doc_results[approach].items():
+                                    results[model][fraction][approach][metric] = results[model][fraction][approach].get(
+                                        metric, []
+                                    ) + [value]
+
+                        # Save intermediate results
+                        with open('experiments/results.json', 'w') as f:
+                            json.dump(results, f, indent=2)
+
+                    except Exception as e:
+                        args = future_to_doc[future]
+                        print(f"Error processing document {args[2]+1}: {e}")
+
+    return results
+
+def format_results_table(results: Dict) -> Table:
+    """Format results using Rich table"""
+    table = Table(
+        title="Experiment Results",
+        box=box.ROUNDED,
+        show_header=True,
+        header_style="bold cyan"
+    )
+
+    # Add columns
+    table.add_column("Model", style="bold")
+    table.add_column("Doc %", justify="right")
+    table.add_column("Approach", style="magenta")
+    table.add_column("Precision", justify="right")
+    table.add_column("Recall", justify="right")
+    table.add_column("F1", justify="right")
+    table.add_column("Avg Runtime", justify="right")
+    table.add_column("Avg Cost ($)", justify="right")
+
+    for model in results:
+        for fraction in sorted(results[model].keys()):
+            for approach in ["structured", "tool"]:
+                metrics = results[model][fraction][approach]
+                table.add_row(
+                    model,
+                    f"{fraction*100:>3.0f}%",
+                    approach,
+                    f"{sum(metrics['precision']) / len(metrics['precision']):.3f}",
+                    f"{sum(metrics['recall']) / len(metrics['recall']):.3f}",
+                    f"{sum(metrics['f1']) / len(metrics['f1']):.3f}",
+                    f"{sum(metrics['runtime']) / len(metrics['runtime']):.3f}s",
+                    f"${sum(metrics['cost']) / len(metrics['cost']):.4f}",
+                )
+            # Add a divider after each fraction except the last one
+            if fraction != max(results[model].keys()):
+                table.add_row()
+        # Add a section divider after each model except the last one
+        if model != list(results.keys())[-1]:
+            table.add_section()
+
+    return table
+
+if __name__ == "__main__":
+    # Run experiment with real debate data
+    results = run_experiment(
+        debates_file="example_data/debates/data.json",
+        num_samples=30
+    )
+
+    # Print rich table
+    console = Console()
+    console.print("\nResults Table:")
+    console.print(format_results_table(results))