From a00f7237542b734e239de539bb3ee99271542ab7 Mon Sep 17 00:00:00 2001
From: tcm390 <60634884+tcm390@users.noreply.github.com>
Date: Tue, 21 Jan 2025 09:42:30 +0800
Subject: [PATCH] feat(x spaces): Don't wait for mute, wait for silence (#2576)

* use getSetting to get the api key
* use debounce for transcription processing instead of unmute
* prevent transcription while previous transcription is processing
* rename
* rename
* make sure the flag would be reset
* clean all buffers when starting to processing audio
* use one timeout instead so only last user audio would be handled
* implement interruption
* process reply message by context and state
* create template file
* rename
* add should response
* use constant
* clean code
* replace console log
* add missing type
* add optional for twitterSpaces
---
 .../src/plugins/SttTtsSpacesPlugin.ts            | 519 +++++++++++++-----
 .../client-twitter/src/plugins/templates.ts      |  90 +++
 packages/client-twitter/src/spaces.ts            |  39 +-
 packages/core/src/types.ts                       |  20 +-
 4 files changed, 512 insertions(+), 156 deletions(-)
 create mode 100644 packages/client-twitter/src/plugins/templates.ts

diff --git a/packages/client-twitter/src/plugins/SttTtsSpacesPlugin.ts b/packages/client-twitter/src/plugins/SttTtsSpacesPlugin.ts
index 767f96fad42..19125faa920 100644
--- a/packages/client-twitter/src/plugins/SttTtsSpacesPlugin.ts
+++ b/packages/client-twitter/src/plugins/SttTtsSpacesPlugin.ts
@@ -1,19 +1,43 @@
 // src/plugins/SttTtsPlugin.ts
 import { spawn } from "child_process";
-import { type ITranscriptionService, elizaLogger } from "@elizaos/core";
-import type { Space, JanusClient, AudioDataWithUser } from "agent-twitter-client";
-import type { Plugin } from "@elizaos/core";
+import {
+    type ITranscriptionService,
+    elizaLogger,
+    stringToUuid,
+    composeContext,
+    getEmbeddingZeroVector,
+    generateMessageResponse,
+    ModelClass,
+    Content,
+    IAgentRuntime,
+    Memory,
+    Plugin,
+    UUID,
+    State,
+    composeRandomUser,
+    generateShouldRespond,
+} from "@elizaos/core";
+import type {
+    Space,
+    JanusClient,
+    AudioDataWithUser,
+} from "agent-twitter-client";
+import { ClientBase } from "../base";
+import {
+    twitterVoiceHandlerTemplate,
+    twitterShouldRespondTemplate,
+} from "./templates";
 
 interface PluginConfig {
-    openAiApiKey?: string; // for STT & ChatGPT
+    runtime: IAgentRuntime;
+    client: ClientBase;
+    spaceId: string;
     elevenLabsApiKey?: string; // for TTS
     sttLanguage?: string; // e.g. "en" for Whisper
-    gptModel?: string; // e.g. "gpt-3.5-turbo"
     silenceThreshold?: number; // amplitude threshold for ignoring silence
     voiceId?: string; // specify which ElevenLabs voice to use
     elevenLabsModel?: string; // e.g. "eleven_monolingual_v1"
-    systemPrompt?: string; // ex. "You are a helpful AI assistant"
"You are a helpful AI assistant" chatContext?: Array<{ role: "system" | "user" | "assistant"; content: string; @@ -21,6 +45,10 @@ interface PluginConfig { transcriptionService: ITranscriptionService; } +const VOLUME_WINDOW_SIZE = 100; +const SPEAKING_THRESHOLD = 0.05; +const SILENCE_DETECTION_THRESHOLD_MS = 1000; // 1-second silence threshold + /** * MVP plugin for speech-to-text (OpenAI) + conversation + TTS (ElevenLabs) * Approach: @@ -30,17 +58,17 @@ interface PluginConfig { export class SttTtsPlugin implements Plugin { name = "SttTtsPlugin"; description = "Speech-to-text (OpenAI) + conversation + TTS (ElevenLabs)"; + private runtime: IAgentRuntime; + private client: ClientBase; + private spaceId: string; private space?: Space; private janus?: JanusClient; - private openAiApiKey?: string; private elevenLabsApiKey?: string; - private gptModel = "gpt-3.5-turbo"; private voiceId = "21m00Tcm4TlvDq8ikWAM"; private elevenLabsModel = "eleven_monolingual_v1"; - private systemPrompt = "You are a helpful AI assistant."; private chatContext: Array<{ role: "system" | "user" | "assistant"; content: string; @@ -53,11 +81,6 @@ export class SttTtsPlugin implements Plugin { */ private pcmBuffers = new Map(); - /** - * Track mute states: userId => boolean (true=unmuted) - */ - private speakerUnmuted = new Map(); - /** * For ignoring near-silence frames (if amplitude < threshold) */ @@ -66,6 +89,11 @@ export class SttTtsPlugin implements Plugin { // TTS queue for sequentially speaking private ttsQueue: string[] = []; private isSpeaking = false; + private isProcessingAudio = false; + + private userSpeakingTimer: NodeJS.Timeout | null = null; + private volumeBuffers: Map; + private ttsAbortController: AbortController | null = null; onAttach(_space: Space) { elizaLogger.log("[SttTtsPlugin] onAttach => space was attached"); @@ -82,10 +110,11 @@ export class SttTtsPlugin implements Plugin { | undefined; const config = params.pluginConfig as PluginConfig; - this.openAiApiKey = config?.openAiApiKey; + this.runtime = config?.runtime; + this.client = config?.client; + this.spaceId = config?.spaceId; this.elevenLabsApiKey = config?.elevenLabsApiKey; this.transcriptionService = config.transcriptionService; - if (config?.gptModel) this.gptModel = config.gptModel; if (typeof config?.silenceThreshold === "number") { this.silenceThreshold = config.silenceThreshold; } @@ -95,45 +124,21 @@ export class SttTtsPlugin implements Plugin { if (config?.elevenLabsModel) { this.elevenLabsModel = config.elevenLabsModel; } - if (config?.systemPrompt) { - this.systemPrompt = config.systemPrompt; - } if (config?.chatContext) { this.chatContext = config.chatContext; } elizaLogger.log("[SttTtsPlugin] Plugin config =>", config); - // Listen for mute events - this.space.on( - "muteStateChanged", - (evt: { userId: string; muted: boolean }) => { - elizaLogger.log( - "[SttTtsPlugin] Speaker muteStateChanged =>", - evt - ); - if (evt.muted) { - this.handleMute(evt.userId).catch((err) => - elizaLogger.error( - "[SttTtsPlugin] handleMute error =>", - err - ) - ); - } else { - this.speakerUnmuted.set(evt.userId, true); - if (!this.pcmBuffers.has(evt.userId)) { - this.pcmBuffers.set(evt.userId, []); - } - } - } - ); + this.volumeBuffers = new Map(); } /** * Called whenever we receive PCM from a speaker */ onAudioData(data: AudioDataWithUser): void { - if (!this.speakerUnmuted.get(data.userId)) return; - + if (this.isProcessingAudio) { + return; + } let maxVal = 0; for (let i = 0; i < data.samples.length; i++) { const val = 
@@ -143,12 +148,62 @@ export class SttTtsPlugin implements Plugin {
             return;
         }
 
+        if (this.userSpeakingTimer) {
+            clearTimeout(this.userSpeakingTimer);
+        }
+
         let arr = this.pcmBuffers.get(data.userId);
         if (!arr) {
             arr = [];
             this.pcmBuffers.set(data.userId, arr);
         }
         arr.push(data.samples);
+
+        if (!this.isSpeaking) {
+            this.userSpeakingTimer = setTimeout(() => {
+                elizaLogger.log(
+                    "[SttTtsPlugin] start processing audio for user =>",
+                    data.userId
+                );
+                this.userSpeakingTimer = null;
+                this.processAudio(data.userId).catch((err) =>
+                    elizaLogger.error(
+                        "[SttTtsPlugin] handleSilence error =>",
+                        err
+                    )
+                );
+            }, SILENCE_DETECTION_THRESHOLD_MS);
+        } else {
+            // check interruption
+            let volumeBuffer = this.volumeBuffers.get(data.userId);
+            if (!volumeBuffer) {
+                volumeBuffer = [];
+                this.volumeBuffers.set(data.userId, volumeBuffer);
+            }
+            const samples = new Int16Array(
+                data.samples.buffer,
+                data.samples.byteOffset,
+                data.samples.length / 2
+            );
+            const maxAmplitude = Math.max(...samples.map(Math.abs)) / 32768;
+            volumeBuffer.push(maxAmplitude);
+
+            if (volumeBuffer.length > VOLUME_WINDOW_SIZE) {
+                volumeBuffer.shift();
+            }
+            const avgVolume =
+                volumeBuffer.reduce((sum, v) => sum + v, 0) /
+                VOLUME_WINDOW_SIZE;
+
+            if (avgVolume > SPEAKING_THRESHOLD) {
+                volumeBuffer.length = 0;
+                if (this.ttsAbortController) {
+                    this.ttsAbortController.abort();
+                    this.isSpeaking = false;
+                    elizaLogger.log("[SttTtsPlugin] TTS playback interrupted");
+                }
+            }
+        }
     }
 
     // /src/sttTtsPlugin.ts
@@ -203,57 +258,84 @@ export class SttTtsPlugin implements Plugin {
     }
 
     /**
-     * On speaker mute => flush STT => GPT => TTS => push to Janus
+     * On speaker silence => flush STT => GPT => TTS => push to Janus
      */
-    private async handleMute(userId: string): Promise<void> {
-        this.speakerUnmuted.set(userId, false);
-        const chunks = this.pcmBuffers.get(userId) || [];
-        this.pcmBuffers.set(userId, []);
-
-        if (!chunks.length) {
-            elizaLogger.warn(
-                "[SttTtsPlugin] No audio chunks for user =>",
-                userId
-            );
+    private async processAudio(userId: UUID): Promise<void> {
+        if (this.isProcessingAudio) {
             return;
         }
-        elizaLogger.log(
-            `[SttTtsPlugin] Flushing STT buffer for user=${userId}, chunks=${chunks.length}`
-        );
+        this.isProcessingAudio = true;
+        try {
+            elizaLogger.log(
+                "[SttTtsPlugin] Starting audio processing for user:",
+                userId
+            );
+            const chunks = this.pcmBuffers.get(userId) || [];
+            this.pcmBuffers.clear();
 
-        const totalLen = chunks.reduce((acc, c) => acc + c.length, 0);
-        const merged = new Int16Array(totalLen);
-        let offset = 0;
-        for (const c of chunks) {
-            merged.set(c, offset);
-            offset += c.length;
-        }
+            if (!chunks.length) {
+                elizaLogger.warn(
+                    "[SttTtsPlugin] No audio chunks for user =>",
+                    userId
+                );
+                return;
+            }
+            elizaLogger.log(
+                `[SttTtsPlugin] Flushing STT buffer for user=${userId}, chunks=${chunks.length}`
+            );
 
-        // Convert PCM to WAV for STT
-        const wavBuffer = await this.convertPcmToWavInMemory(merged, 48000);
+            const totalLen = chunks.reduce((acc, c) => acc + c.length, 0);
+            const merged = new Int16Array(totalLen);
+            let offset = 0;
+            for (const c of chunks) {
+                merged.set(c, offset);
+                offset += c.length;
+            }
 
-        // Whisper STT
-        const sttText = await this.transcriptionService.transcribe(wavBuffer);
+            // Convert PCM to WAV for STT
+            const wavBuffer = await this.convertPcmToWavInMemory(merged, 48000);
 
-        if (!sttText || !sttText.trim()) {
-            elizaLogger.warn(
-                "[SttTtsPlugin] No speech recognized for user =>",
-                userId
+            // Whisper STT
+            const sttText = await this.transcriptionService.transcribe(
+                wavBuffer
             );
-            return;
-        }
-        elizaLogger.log(
-            `[SttTtsPlugin] STT => user=${userId}, text="${sttText}"`
-        );
-        // GPT answer
-        const replyText = await this.askChatGPT(sttText);
-        elizaLogger.log(
-            `[SttTtsPlugin] GPT => user=${userId}, reply="${replyText}"`
-        );
+            elizaLogger.log(
+                `[SttTtsPlugin] Transcription result: "${sttText}"`
+            );
-        // Use the standard speak method with queue
-        await this.speakText(replyText);
+            if (!sttText || !sttText.trim()) {
+                elizaLogger.warn(
+                    "[SttTtsPlugin] No speech recognized for user =>",
+                    userId
+                );
+                return;
+            }
+            elizaLogger.log(
+                `[SttTtsPlugin] STT => user=${userId}, text="${sttText}"`
+            );
+
+            // Get response
+            const replyText = await this.handleUserMessage(sttText, userId);
+            if (!replyText || !replyText.length || !replyText.trim()) {
+                elizaLogger.warn(
+                    "[SttTtsPlugin] No replyText for user =>",
+                    userId
+                );
+                return;
+            }
+            elizaLogger.log(
+                `[SttTtsPlugin] user=${userId}, reply="${replyText}"`
+            );
+            this.isProcessingAudio = false;
+            this.volumeBuffers.clear();
+            // Use the standard speak method with queue
+            await this.speakText(replyText);
+        } catch (error) {
+            elizaLogger.error("[SttTtsPlugin] processAudio error =>", error);
+        } finally {
+            this.isProcessingAudio = false;
+        }
     }
 
     /**
@@ -280,55 +362,246 @@ export class SttTtsPlugin implements Plugin {
             const text = this.ttsQueue.shift();
             if (!text) continue;
 
+            this.ttsAbortController = new AbortController();
+            const { signal } = this.ttsAbortController;
+
             try {
                 const ttsAudio = await this.elevenLabsTts(text);
                 const pcm = await this.convertMp3ToPcm(ttsAudio, 48000);
+                if (signal.aborted) {
+                    elizaLogger.log(
+                        "[SttTtsPlugin] TTS interrupted before streaming"
+                    );
+                    return;
+                }
                 await this.streamToJanus(pcm, 48000);
+                if (signal.aborted) {
+                    elizaLogger.log(
+                        "[SttTtsPlugin] TTS interrupted after streaming"
+                    );
+                    return;
+                }
             } catch (err) {
                 elizaLogger.error("[SttTtsPlugin] TTS streaming error =>", err);
+            } finally {
+                // Clean up the AbortController
+                this.ttsAbortController = null;
             }
         }
         this.isSpeaking = false;
     }
 
     /**
-     * Simple ChatGPT call
+     * Handle User Message
      */
-    private async askChatGPT(userText: string): Promise<string> {
-        if (!this.openAiApiKey) {
-            throw new Error("[SttTtsPlugin] No OpenAI API key for ChatGPT");
+    private async handleUserMessage(
+        userText: string,
+        userId: UUID
+    ): Promise<string> {
+        await this.runtime.ensureUserExists(
+            this.runtime.agentId,
+            this.client.profile.username,
+            this.runtime.character.name,
+            "twitter"
+        );
+
+        const roomId = stringToUuid("twitter_generate_room-" + this.spaceId);
+        let state = await this.runtime.composeState(
+            {
+                agentId: this.runtime.agentId,
+                content: { text: userText, source: "twitter" },
+                userId,
+                roomId,
+            },
+            {
+                twitterUserName: this.client.profile.username,
+                agentName: this.runtime.character.name,
+            }
+        );
+
+        const memory = {
+            id: stringToUuid(roomId + "-voice-message-" + Date.now()),
+            agentId: this.runtime.agentId,
+            content: {
+                text: userText,
+                source: "twitter",
+            },
+            userId,
+            roomId,
+            embedding: getEmbeddingZeroVector(),
+            createdAt: Date.now(),
+        };
+
+        await this.runtime.messageManager.createMemory(memory);
+
+        state = await this.runtime.updateRecentMessageState(state);
+
+        const shouldIgnore = await this._shouldIgnore(memory);
+
+        if (shouldIgnore) {
+            return "";
         }
-        const url = "https://api.openai.com/v1/chat/completions";
-        const messages = [
-            { role: "system", content: this.systemPrompt },
-            ...this.chatContext,
-            { role: "user", content: userText },
-        ];
-        const resp = await fetch(url, {
-            method: "POST",
-            headers: {
-                Authorization: `Bearer ${this.openAiApiKey}`,
-                "Content-Type": "application/json",
+        const shouldRespond = await this._shouldRespond(userText, state);
+
+        if (!shouldRespond) {
+            return "";
+        }
+
+        const context = composeContext({
+            state,
+            template:
+                this.runtime.character.templates?.twitterVoiceHandlerTemplate ||
+                this.runtime.character.templates?.messageHandlerTemplate ||
+                twitterVoiceHandlerTemplate,
+        });
+
+        const responseContent = await this._generateResponse(memory, context);
+
+        const responseMemory: Memory = {
+            id: stringToUuid(memory.id + "-voice-response-" + Date.now()),
+            agentId: this.runtime.agentId,
+            userId: this.runtime.agentId,
+            content: {
+                ...responseContent,
+                user: this.runtime.character.name,
+                inReplyTo: memory.id,
             },
-            body: JSON.stringify({
-                model: this.gptModel,
-                messages,
-            }),
+            roomId,
+            embedding: getEmbeddingZeroVector(),
+        };
+
+        const reply = responseMemory.content.text?.trim();
+        if (reply) {
+            await this.runtime.messageManager.createMemory(responseMemory);
+        }
+
+        return reply;
+    }
+
+    private async _generateResponse(
+        message: Memory,
+        context: string
+    ): Promise<Content> {
+        const { userId, roomId } = message;
+
+        const response = await generateMessageResponse({
+            runtime: this.runtime,
+            context,
+            modelClass: ModelClass.SMALL,
         });
-        if (!resp.ok) {
-            const errText = await resp.text();
-            throw new Error(
-                `[SttTtsPlugin] ChatGPT error => ${resp.status} ${errText}`
+        response.source = "discord";
+
+        if (!response) {
+            elizaLogger.error(
+                "[SttTtsPlugin] No response from generateMessageResponse"
             );
+            return;
+        }
+
+        await this.runtime.databaseAdapter.log({
+            body: { message, context, response },
+            userId: userId,
+            roomId,
+            type: "response",
+        });
+
+        return response;
+    }
+
+    private async _shouldIgnore(message: Memory): Promise<boolean> {
+        elizaLogger.debug("message.content: ", message.content);
+        // if the message is 3 characters or less, ignore it
+        if ((message.content as Content).text.length < 3) {
+            return true;
+        }
+
+        const loseInterestWords = [
+            // telling the bot to stop talking
+            "shut up",
+            "stop",
+            "dont talk",
+            "silence",
+            "stop talking",
+            "be quiet",
+            "hush",
+            "stfu",
+            "stupid bot",
+            "dumb bot",
+
+            // offensive words
+            "fuck",
+            "shit",
+            "damn",
+            "suck",
+            "dick",
+            "cock",
+            "sex",
+            "sexy",
+        ];
+        if (
+            (message.content as Content).text.length < 50 &&
+            loseInterestWords.some((word) =>
+                (message.content as Content).text?.toLowerCase().includes(word)
+            )
+        ) {
+            return true;
+        }
+
+        const ignoreWords = ["k", "ok", "bye", "lol", "nm", "uh"];
+        if (
+            (message.content as Content).text?.length < 8 &&
+            ignoreWords.some((word) =>
+                (message.content as Content).text?.toLowerCase().includes(word)
+            )
+        ) {
+            return true;
+        }
+
+        return false;
+    }
+
+    private async _shouldRespond(
+        message: string,
+        state: State
+    ): Promise<boolean> {
+        const lowerMessage = message.toLowerCase();
+        const characterName = this.runtime.character.name.toLowerCase();
+
+        if (lowerMessage.includes(characterName)) {
+            return true;
         }
-        const json = await resp.json();
-        const reply = json.choices?.[0]?.message?.content || "";
-        this.chatContext.push({ role: "user", content: userText });
-        this.chatContext.push({ role: "assistant", content: reply });
-        return reply.trim();
+        // If none of the above conditions are met, use the generateText to decide
+        const shouldRespondContext = composeContext({
+            state,
+            template:
+                this.runtime.character.templates
+                    ?.twitterShouldRespondTemplate ||
+                this.runtime.character.templates?.shouldRespondTemplate ||
+                composeRandomUser(twitterShouldRespondTemplate, 2),
+        });
+
+        const response = await generateShouldRespond({
+            runtime: this.runtime,
+            context: shouldRespondContext,
+            modelClass: ModelClass.SMALL,
+        });
+
+        if (response === "RESPOND") {
+            return true;
+        } else if (response === "IGNORE") {
+            return false;
+        } else if (response === "STOP") {
+            return false;
+        } else {
+            elizaLogger.error(
+                "Invalid response from response generateText:",
+                response
+            );
+            return false;
+        }
     }
 
     /**
@@ -422,6 +695,10 @@ export class SttTtsPlugin implements Plugin {
             offset + FRAME_SIZE <= samples.length;
             offset += FRAME_SIZE
         ) {
+            if (this.ttsAbortController?.signal.aborted) {
+                elizaLogger.log("[SttTtsPlugin] streamToJanus interrupted");
+                return;
+            }
             const frame = new Int16Array(FRAME_SIZE);
             frame.set(samples.subarray(offset, offset + FRAME_SIZE));
             this.janus?.pushLocalAudio(frame, sampleRate, 1);
@@ -431,19 +708,6 @@ export class SttTtsPlugin implements Plugin {
         }
     }
 
-    public setSystemPrompt(prompt: string) {
-        this.systemPrompt = prompt;
-        elizaLogger.log("[SttTtsPlugin] setSystemPrompt =>", prompt);
-    }
-
-    /**
-     * Change the GPT model at runtime (e.g. "gpt-4", "gpt-3.5-turbo", etc.).
-     */
-    public setGptModel(model: string) {
-        this.gptModel = model;
-        elizaLogger.log("[SttTtsPlugin] setGptModel =>", model);
-    }
-
     /**
      * Add a message (system, user or assistant) to the chat context.
      * E.g. to store conversation history or inject a persona.
@@ -466,8 +730,9 @@ export class SttTtsPlugin implements Plugin {
     cleanup(): void {
         elizaLogger.log("[SttTtsPlugin] cleanup => releasing resources");
         this.pcmBuffers.clear();
-        this.speakerUnmuted.clear();
+        this.userSpeakingTimer = null;
         this.ttsQueue = [];
         this.isSpeaking = false;
+        this.volumeBuffers.clear();
     }
 }
diff --git a/packages/client-twitter/src/plugins/templates.ts b/packages/client-twitter/src/plugins/templates.ts
new file mode 100644
index 00000000000..05882425c06
--- /dev/null
+++ b/packages/client-twitter/src/plugins/templates.ts
@@ -0,0 +1,90 @@
+import { messageCompletionFooter, shouldRespondFooter } from "@elizaos/core";
+
+export const twitterShouldRespondTemplate =
+    `# Task: Decide if {{agentName}} should respond.
+About {{agentName}}:
+{{bio}}
+
+# INSTRUCTIONS: Determine if {{agentName}} should respond to the message and participate in the conversation. Do not comment. Just respond with "RESPOND" or "IGNORE" or "STOP".
+
+# RESPONSE EXAMPLES
+{{user1}}: I just saw a really great movie
+{{user2}}: Oh? Which movie?
+Result: [IGNORE]
+
+{{agentName}}: Oh, this is my favorite scene
+{{user1}}: sick
+{{user2}}: wait, why is it your favorite scene
+Result: [RESPOND]
+
+{{user1}}: stfu bot
+Result: [STOP]
+
+{{user1}}: Hey {{agent}}, can you help me with something
+Result: [RESPOND]
+
+{{user1}}: {{agentName}} stfu plz
+Result: [STOP]
+
+{{user1}}: i need help
+{{agentName}}: how can I help you?
+{{user1}}: no. i need help from someone else
+Result: [IGNORE]
+
+{{user1}}: Hey {{agent}}, can I ask you a question
+{{agentName}}: Sure, what is it
+{{user1}}: can you ask claude to create a basic react module that demonstrates a counter
+Result: [RESPOND]
+
+{{user1}}: {{agentName}} can you tell me a story
+{{user1}}: about a girl named elara
+{{agentName}}: Sure.
+{{agentName}}: Once upon a time, in a quaint little village, there was a curious girl named Elara.
+{{agentName}}: Elara was known for her adventurous spirit and her knack for finding beauty in the mundane.
+{{user1}}: I'm loving it, keep going
+Result: [RESPOND]
+
+{{user1}}: {{agentName}} stop responding plz
+Result: [STOP]
+
+{{user1}}: okay, i want to test something. can you say marco?
+{{agentName}}: marco
+{{user1}}: great. okay, now do it again
+Result: [RESPOND]
+
+Response options are [RESPOND], [IGNORE] and [STOP].
+
+{{agentName}} is in a room with other users and is very worried about being annoying and saying too much.
+Respond with [RESPOND] to messages that are directed at {{agentName}}, or participate in conversations that are interesting or relevant to their background.
+If a message is not interesting or relevant, respond with [IGNORE]
+Unless directly responding to a user, respond with [IGNORE] to messages that are very short or do not contain much information.
+If a user asks {{agentName}} to be quiet, respond with [STOP]
+If {{agentName}} concludes a conversation and isn't part of the conversation anymore, respond with [STOP]
+
+IMPORTANT: {{agentName}} is particularly sensitive about being annoying, so if there is any doubt, it is better to respond with [IGNORE].
+If {{agentName}} is conversing with a user and they have not asked to stop, it is better to respond with [RESPOND].
+
+{{recentMessages}}
+
+# INSTRUCTIONS: Choose the option that best describes {{agentName}}'s response to the last message. Ignore messages if they are addressed to someone else.
+` + shouldRespondFooter;
+
+export const twitterVoiceHandlerTemplate =
+    `# Task: Generate conversational voice dialog for {{agentName}}.
+    About {{agentName}}:
+    {{bio}}
+
+    # Attachments
+    {{attachments}}
+
+    # Capabilities
+    Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section.
+
+    {{actions}}
+
+    {{messageDirections}}
+
+    {{recentMessages}}
+
+    # Instructions: Write the next message for {{agentName}}. Include an optional action if appropriate. {{actionNames}}
+    ` + messageCompletionFooter;
diff --git a/packages/client-twitter/src/spaces.ts b/packages/client-twitter/src/spaces.ts
index 44f679c3298..d9c22e9dd77 100644
--- a/packages/client-twitter/src/spaces.ts
+++ b/packages/client-twitter/src/spaces.ts
@@ -6,6 +6,7 @@ import {
     ModelClass,
     ServiceType,
     type ITranscriptionService,
+    TwitterSpaceDecisionOptions,
 } from "@elizaos/core";
 import type { ClientBase } from "./base";
 import {
@@ -18,24 +19,6 @@ import {
 } from "agent-twitter-client";
 import { SttTtsPlugin } from "./plugins/SttTtsSpacesPlugin.ts";
 
-interface SpaceDecisionOptions {
-    maxSpeakers?: number;
-    topics?: string[];
-    typicalDurationMinutes?: number;
-    idleKickTimeoutMs?: number;
-    minIntervalBetweenSpacesMinutes?: number;
-    businessHoursOnly?: boolean;
-    randomChance?: number;
-    enableIdleMonitor?: boolean;
-    enableSttTts?: boolean;
-    enableRecording?: boolean;
-    voiceId?: string;
-    sttLanguage?: string;
-    gptModel?: string;
-    systemPrompt?: string;
-    speakerMaxDurationMs?: number;
-}
-
 interface CurrentSpeakerState {
     userId: string;
     sessionUUID: string;
@@ -134,6 +117,7 @@ Example:
  * Main class: manage a Twitter Space with N speakers max, speaker queue, filler messages, etc.
  */
 export class TwitterSpaceClient {
+    private runtime: IAgentRuntime;
     private client: ClientBase;
     private scraper: Scraper;
     private isSpaceRunning = false;
@@ -150,11 +134,12 @@ export class TwitterSpaceClient {
     private activeSpeakers: CurrentSpeakerState[] = [];
     private speakerQueue: SpeakerRequest[] = [];
 
-    private decisionOptions: SpaceDecisionOptions;
+    private decisionOptions: TwitterSpaceDecisionOptions;
 
     constructor(client: ClientBase, runtime: IAgentRuntime) {
         this.client = client;
         this.scraper = client.twitterClient;
+        this.runtime = runtime;
 
         const charSpaces = runtime.character.twitterSpaces || {};
         this.decisionOptions = {
@@ -174,8 +159,6 @@ export class TwitterSpaceClient {
                     runtime.character.settings.voice.model ||
                     "Xb7hH8MSUJpSbSDYk0k2",
             sttLanguage: charSpaces.sttLanguage || "en",
-            gptModel: charSpaces.gptModel,
-            systemPrompt: charSpaces.systemPrompt,
             speakerMaxDurationMs: charSpaces.speakerMaxDurationMs ?? 4 * 60_000,
         };
     }
@@ -307,9 +290,11 @@ export class TwitterSpaceClient {
         this.speakerQueue = [];
 
         // Retrieve keys
-        const openAiKey = process.env.OPENAI_API_KEY || "";
-        const elevenLabsKey = process.env.ELEVENLABS_XI_API_KEY || "";
+        const elevenLabsKey =
+            this.runtime.getSetting("ELEVENLABS_XI_API_KEY") || "";
+
+        const broadcastInfo = await this.currentSpace.initialize(config);
+        this.spaceId = broadcastInfo.room_id;
 
         // Plugins
         if (this.decisionOptions.enableRecording) {
             elizaLogger.log("[Space] Using RecordToDiskPlugin");
@@ -321,11 +306,11 @@ export class TwitterSpaceClient {
             const sttTts = new SttTtsPlugin();
             this.sttTtsPlugin = sttTts;
             this.currentSpace.use(sttTts, {
-                openAiApiKey: openAiKey,
+                runtime: this.runtime,
+                client: this.client,
+                spaceId: this.spaceId,
                 elevenLabsApiKey: elevenLabsKey,
                 voiceId: this.decisionOptions.voiceId,
-                gptModel: this.decisionOptions.gptModel,
-                systemPrompt: this.decisionOptions.systemPrompt,
                 sttLanguage: this.decisionOptions.sttLanguage,
                 transcriptionService:
                     this.client.runtime.getService<ITranscriptionService>(
                         ServiceType.TRANSCRIPTION
                     ),
@@ -344,8 +329,6 @@ export class TwitterSpaceClient {
             );
         }
 
-        const broadcastInfo = await this.currentSpace.initialize(config);
-        this.spaceId = broadcastInfo.room_id;
         this.isSpaceRunning = true;
         await this.scraper.sendTweet(
             broadcastInfo.share_url.replace("broadcasts", "spaces")
diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
index a53f44591af..3b4018ad42b 100644
--- a/packages/core/src/types.ts
+++ b/packages/core/src/types.ts
@@ -740,6 +740,7 @@ export type Character = {
         twitterPostTemplate?: TemplateType;
         twitterMessageHandlerTemplate?: TemplateType;
         twitterShouldRespondTemplate?: TemplateType;
+        twitterVoiceHandlerTemplate?: TemplateType;
         instagramPostTemplate?: TemplateType;
         instagramMessageHandlerTemplate?: TemplateType;
         instagramShouldRespondTemplate?: TemplateType;
@@ -918,8 +919,26 @@ export type Character = {
     /**Optinal Parent characters to inherit information from */
     extends?: string[];
+
+    twitterSpaces?: TwitterSpaceDecisionOptions;
 };
 
+export interface TwitterSpaceDecisionOptions {
+    maxSpeakers?: number;
+    topics?: string[];
+    typicalDurationMinutes?: number;
+    idleKickTimeoutMs?: number;
+    minIntervalBetweenSpacesMinutes?: number;
+    businessHoursOnly?: boolean;
+    randomChance?: number;
+    enableIdleMonitor?: boolean;
+    enableSttTts?: boolean;
+    enableRecording?: boolean;
+    voiceId?: string;
+    sttLanguage?: string;
+    speakerMaxDurationMs?: number;
+}
+
 /**
  * Interface for database operations
  */
@@ -1622,4 +1641,3 @@ export interface ChunkRow {
     id: string;
     // Add other properties if needed
 }
-
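
Note on the silence-detection change above: every incoming PCM frame now buffers audio and re-arms a one-second timer, so transcription starts only after the speaker has actually stopped, rather than on a mute event. A minimal standalone sketch of that debounce pattern, kept separate from the Space/Janus wiring; SilenceDebouncer, onFrame, and onSilence are illustrative names, not part of the plugin's API:

// silence-debounce.ts (sketch only)
type PcmFrame = { userId: string; samples: Int16Array };

const SILENCE_MS = 1000; // mirrors SILENCE_DETECTION_THRESHOLD_MS in the patch

class SilenceDebouncer {
    private buffers = new Map<string, Int16Array[]>();
    private timer: NodeJS.Timeout | null = null;

    constructor(
        private onSilence: (userId: string, chunks: Int16Array[]) => void
    ) {}

    // Each frame buffers audio and postpones processing; the callback fires
    // only once no frame has arrived for the full silence threshold.
    onFrame(frame: PcmFrame): void {
        if (this.timer) clearTimeout(this.timer);

        const chunks = this.buffers.get(frame.userId) ?? [];
        chunks.push(frame.samples);
        this.buffers.set(frame.userId, chunks);

        this.timer = setTimeout(() => {
            this.timer = null;
            const buffered = this.buffers.get(frame.userId) ?? [];
            this.buffers.delete(frame.userId);
            this.onSilence(frame.userId, buffered);
        }, SILENCE_MS);
    }
}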
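
Note on the interruption change: while the agent is speaking, a rolling window of normalized peak amplitudes decides that a user has started talking, and an AbortController cancels TTS playback between 10 ms frames. A hedged sketch of that path under the assumption of a pushFrame callback standing in for janus.pushLocalAudio; the window constants match the ones the patch introduces:

// barge-in.ts (sketch only)
const VOLUME_WINDOW_SIZE = 100;
const SPEAKING_THRESHOLD = 0.05;

function isUserSpeaking(volumeWindow: number[], samples: Int16Array): boolean {
    // Normalize the frame's peak amplitude to 0..1 and keep the last N values.
    let peak = 0;
    for (const s of samples) peak = Math.max(peak, Math.abs(s));
    volumeWindow.push(peak / 32768);
    if (volumeWindow.length > VOLUME_WINDOW_SIZE) volumeWindow.shift();

    const avg =
        volumeWindow.reduce((sum, v) => sum + v, 0) / VOLUME_WINDOW_SIZE;
    return avg > SPEAKING_THRESHOLD;
}

async function streamWithInterrupt(
    frames: Int16Array[],
    pushFrame: (frame: Int16Array) => void,
    signal: AbortSignal
): Promise<void> {
    for (const frame of frames) {
        if (signal.aborted) return; // barge-in detected: stop mid-utterance
        pushFrame(frame);
        await new Promise((resolve) => setTimeout(resolve, 10)); // 10 ms/frame
    }
}

// Usage: keep one controller per utterance where the audio handler can
// reach it, and call controller.abort() when isUserSpeaking() returns true.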