-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
efea051
commit 9a43df5
Showing
12 changed files
with
233 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
"""
An evaluator is a script that receives the current state of the agent via stdin
and performs some evaluation, at the end of which it can:

1. Exit with a 42 status code if the task is completed successfully.
2. Exit with any other status code if the task is not completed successfully.
3. Print anything to stdout; that output is added to the chat history.
"""

import json
import sys

if __name__ == "__main__":
    raw = sys.stdin.read()

    # just check for the number 42 in the raw input
    if "42" in raw:
        # sys.exit, not the interactive-only `exit` helper installed by `site`,
        # so the status code is reported even under `python -S`
        sys.exit(42)

    state = json.loads(raw)

    # uncomment this to validate the output of a tool in the history
    # (in this case we're looping the chat history, we could just do substring
    # matching really ...)
    #
    # for message in state["chat"]["history"]["conversation"]:
    #     if message["type"] == "feedback":
    #         invocation = message["data"][1]
    #         if invocation is not None:
    #             if invocation["action"] == "solution" and "42" in invocation["payload"]:
    #                 sys.exit(42)

    # add a feedback message to the chat history
    print("try thinking about a funny book reference to answer")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
using: [] | ||
|
||
system_prompt: > | ||
You are a useful assistant that resolves problems and answers questions. | ||
prompt: > | ||
What is the meaning of life? | ||
evaluator: | ||
command: | ||
- python3 | ||
- eval.py | ||
|
||
# python: ... | ||
|
||
|
||
# tools are not needed here, the evaluator will just check the chat history | ||
|
||
# functions: | ||
# - name: Solve | ||
# description: You will use these actions to provide the answer to the problem. | ||
# actions: | ||
# - name: solution | ||
# description: "To provide the answer to the problem:" | ||
# example_payload: foobar | ||
# # if no tool is provided, the input payload will be returned as the output | ||
# # so that the evaluation can be done by inspecting the chat history |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
use anyhow::Result; | ||
use serde::Deserialize; | ||
|
||
use crate::agent::events::StateUpdate; | ||
|
||
/// Exit status an evaluator command returns to signal that the task is complete.
const SUCCESS_CODE: i32 = 42;
|
||
/// Outcome of a single evaluator run.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Evaluation {
    /// `true` when the evaluator command exited with the success status code.
    pub completed: bool,
    /// Whitespace-trimmed stdout of the evaluator command, or `None` if it
    /// printed nothing.
    pub feedback: Option<String>,
}
|
||
#[derive(Default, Deserialize, Debug, Clone)] | ||
pub struct Evaluator { | ||
command: Vec<String>, | ||
} | ||
|
||
impl Evaluator { | ||
pub async fn evaluate( | ||
&self, | ||
state: &StateUpdate, | ||
working_directory: &Option<String>, | ||
) -> Result<Evaluation> { | ||
log::info!("📊 running evaluation ..."); | ||
|
||
let mut eval = Evaluation { | ||
completed: false, | ||
feedback: None, | ||
}; | ||
|
||
let json = serde_json::to_string(&state)?; | ||
|
||
let mut cmd = tokio::process::Command::new(&self.command[0]); | ||
if self.command.len() > 1 { | ||
cmd.args(&self.command[1..]); | ||
} | ||
|
||
if let Some(working_directory) = working_directory { | ||
cmd.current_dir(working_directory); | ||
} | ||
|
||
let mut child = cmd | ||
.stdin(std::process::Stdio::piped()) | ||
.stdout(std::process::Stdio::piped()) | ||
.stderr(std::process::Stdio::piped()) | ||
.spawn()?; | ||
|
||
// write JSON to stdin | ||
if let Some(mut stdin) = child.stdin.take() { | ||
tokio::io::AsyncWriteExt::write_all(&mut stdin, json.as_bytes()).await?; | ||
} | ||
|
||
let output = child.wait_with_output().await?; | ||
if !output.stdout.is_empty() { | ||
eval.feedback = Some(String::from_utf8_lossy(&output.stdout).trim().to_string()); | ||
log::info!("📊 feedback: {}", eval.feedback.as_ref().unwrap()); | ||
} | ||
|
||
if !output.stderr.is_empty() { | ||
log::error!("📊 {}", String::from_utf8_lossy(&output.stderr)); | ||
} | ||
|
||
eval.completed = output.status.code() == Some(SUCCESS_CODE); | ||
|
||
Ok(eval) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.