From 4d1a87ffad0bbe2dae5f3604434ec13510e8e378 Mon Sep 17 00:00:00 2001 From: Luke Oliff Date: Mon, 11 Nov 2024 17:03:00 +0000 Subject: [PATCH] feat: upgrade demo from STT/LLM/TTS to Agent API (#56) * feat: added anonymous session to secure serverless endpoints (#54) * feat: upgrade from stt-llm-tts to agent-api (#57) * feat: added anonymous session to secure serverless endpoints * feat: integrated anonymous session in client for specific route * feat: added full authentication and hold some code for a dirty error * fix: recommit fixed package-lock.json and added authentication * chore: removed all redundent code and settings message --------- Co-authored-by: TechAlchmy Co-authored-by: TechAlchemist <114001662+TechAlchmy@users.noreply.github.com> Co-authored-by: lukeocodes * fix: style tweak * feat: add banner to click back to v1 --------- Co-authored-by: TechAlchemist <114001662+TechAlchmy@users.noreply.github.com> Co-authored-by: TechAlchemy423 BREAKING CHANGE: switching APIs from STT/LLM/TTS to Agent API changes the entire purpose of this demo --- app/api/authenticate/route.ts | 12 +- app/api/brain/route.ts | 36 - app/api/generate-token/route.ts | 9 + app/api/speak/route.ts | 61 - app/api/verify-token/route.ts | 11 + app/components/AgentAvatar.tsx | 30 - app/components/AgentControls.tsx | 79 + app/components/ChatBubble.tsx | 60 +- app/components/Controls.tsx | 134 - app/components/Conversation.tsx | 490 - app/components/ConversationAgent.tsx | 79 + app/components/Download.tsx | 49 - .../{InitialLoad.tsx => InitialLoadAgent.tsx} | 29 +- app/components/LeftBubble.tsx | 39 - app/components/MessageAudio.tsx | 153 +- app/components/MessageHeader.tsx | 24 +- app/components/MessageMeta.tsx | 89 - app/components/RightBubble.tsx | 2 - app/components/Settings.tsx | 174 - app/config/index.ts | 11 + app/context/AudioStore.tsx | 47 - app/context/Auth.tsx | 85 + app/context/Deepgram.tsx | 251 - app/context/MessageMetadata.tsx | 51 - app/context/Microphone.tsx | 134 - app/context/Toast.tsx | 6 +- app/context/WebSocketContext.tsx | 529 ++ app/layout.tsx | 22 +- app/lib/authMiddleware.ts | 21 + app/lib/constants.ts | 134 - app/lib/helpers.ts | 116 +- app/lib/hooks/useLocalStorage.ts | 24 - app/lib/hooks/useSubmit.tsx | 25 - app/lib/jwt.ts | 18 + app/page.tsx | 15 +- app/worker.ts | 42 - next.config.js | 32 - package-lock.json | 8111 +++++++++-------- package.json | 15 +- sample.env.local | 6 +- tailwind.config.ts | 1 + 41 files changed, 5337 insertions(+), 5919 deletions(-) delete mode 100644 app/api/brain/route.ts create mode 100644 app/api/generate-token/route.ts delete mode 100644 app/api/speak/route.ts create mode 100644 app/api/verify-token/route.ts delete mode 100644 app/components/AgentAvatar.tsx create mode 100644 app/components/AgentControls.tsx delete mode 100644 app/components/Controls.tsx delete mode 100644 app/components/Conversation.tsx create mode 100644 app/components/ConversationAgent.tsx delete mode 100644 app/components/Download.tsx rename app/components/{InitialLoad.tsx => InitialLoadAgent.tsx} (70%) delete mode 100644 app/components/LeftBubble.tsx delete mode 100644 app/components/MessageMeta.tsx delete mode 100644 app/components/Settings.tsx create mode 100644 app/config/index.ts delete mode 100644 app/context/AudioStore.tsx create mode 100644 app/context/Auth.tsx delete mode 100644 app/context/Deepgram.tsx delete mode 100644 app/context/MessageMetadata.tsx delete mode 100644 app/context/Microphone.tsx create mode 100644 app/context/WebSocketContext.tsx create mode 100644 
app/lib/authMiddleware.ts delete mode 100644 app/lib/constants.ts delete mode 100644 app/lib/hooks/useLocalStorage.ts delete mode 100644 app/lib/hooks/useSubmit.tsx create mode 100644 app/lib/jwt.ts delete mode 100644 app/worker.ts diff --git a/app/api/authenticate/route.ts b/app/api/authenticate/route.ts index edda1f8f..7bbdc944 100644 --- a/app/api/authenticate/route.ts +++ b/app/api/authenticate/route.ts @@ -1,19 +1,27 @@ +import { config } from "@/app/config"; +import { verifyJWT } from "@/app/lib/authMiddleware"; import { DeepgramError, createClient } from "@deepgram/sdk"; import { NextResponse, type NextRequest } from "next/server"; export const revalidate = 0; export async function GET(request: NextRequest) { + // verify jwt token + const authResponse = verifyJWT(request); + if (authResponse.status !== 200) { + return authResponse; + } + // exit early so we don't request 70000000 keys while in devmode if (process.env.DEEPGRAM_ENV === "development") { return NextResponse.json({ - key: process.env.DEEPGRAM_API_KEY ?? "", + key: config.deepGramApiKey ?? "", }); } // gotta use the request object to invalidate the cache every request :vomit: const url = request.url; - const deepgram = createClient(process.env.DEEPGRAM_API_KEY ?? ""); + const deepgram = createClient(config.deepGramApiKey ?? ""); let { result: projectsResult, error: projectsError } = await deepgram.manage.getProjects(); diff --git a/app/api/brain/route.ts b/app/api/brain/route.ts deleted file mode 100644 index 86d1ebae..00000000 --- a/app/api/brain/route.ts +++ /dev/null @@ -1,36 +0,0 @@ -import OpenAI from "openai"; -import { OpenAIStream, StreamingTextResponse } from "ai"; - -// Optional, but recommended: run on the edge runtime. -// See https://vercel.com/docs/concepts/functions/edge-functions -export const runtime = "edge"; - -const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY!, -}); - -export async function POST(req: Request) { - // Extract the `messages` from the body of the request - const { messages } = await req.json(); - const start = Date.now(); - - // Request the OpenAI API for the response based on the prompt - try { - const response = await openai.chat.completions.create({ - model: "gpt-3.5-turbo-0125", - stream: true, - messages: messages, - }); - - const stream = OpenAIStream(response); - - return new StreamingTextResponse(stream, { - headers: { - "X-LLM-Start": `${start}`, - "X-LLM-Response": `${Date.now()}`, - }, - }); - } catch (error) { - console.error("test", error); - } -} diff --git a/app/api/generate-token/route.ts b/app/api/generate-token/route.ts new file mode 100644 index 00000000..1d3f68e5 --- /dev/null +++ b/app/api/generate-token/route.ts @@ -0,0 +1,9 @@ +import { generateToken } from "@/app/lib/jwt"; +import { NextResponse } from "next/server"; + +export async function POST() { + const payload = { role: "anonymous" + Date.now() }; + + const token = generateToken(payload); + return NextResponse.json({ token }); +} diff --git a/app/api/speak/route.ts b/app/api/speak/route.ts deleted file mode 100644 index 0a74336d..00000000 --- a/app/api/speak/route.ts +++ /dev/null @@ -1,61 +0,0 @@ -import { Message } from "ai"; -import { NextRequest, NextResponse } from "next/server"; - -/** - * Return a stream from the API - * @param {NextRequest} req - The HTTP request - * @returns {Promise} A NextResponse with the streamable response - */ -export async function POST(req: NextRequest) { - // gotta use the request object to invalidate the cache every request :vomit: - const url = req.url; - 
const model = req.nextUrl.searchParams.get("model") ?? "aura-asteria-en"; - const message: Message = await req.json(); - const start = Date.now(); - - let text = message.content; - - text = text - .replaceAll("ยก", "") - .replaceAll("https://", "") - .replaceAll("http://", "") - .replaceAll(".com", " dot com") - .replaceAll(".org", " dot org") - .replaceAll(".co.uk", " dot co dot UK") - .replaceAll(/```[\s\S]*?```/g, "\nAs shown on the app.\n") - .replaceAll( - /([a-zA-Z0-9])\/([a-zA-Z0-9])/g, - (match, precedingText, followingText) => { - return precedingText + " forward slash " + followingText; - } - ); - - return await fetch( - `${process.env.DEEPGRAM_STT_DOMAIN}/v1/speak?model=${model}`, - { - method: "POST", - body: JSON.stringify({ text }), - headers: { - "Content-Type": `application/json`, - Authorization: `token ${process.env.DEEPGRAM_API_KEY || ""}`, - "X-DG-Referrer": url, - }, - } - ) - .then(async (response) => { - const headers = new Headers(); - headers.set("X-DG-Latency", `${Date.now() - start}`); - headers.set("Content-Type", "audio/mp3"); - - if (!response?.body) { - return new NextResponse("Unable to get response from API.", { - status: 500, - }); - } - - return new NextResponse(response.body, { headers }); - }) - .catch((error: any) => { - return new NextResponse(error || error?.message, { status: 500 }); - }); -} diff --git a/app/api/verify-token/route.ts b/app/api/verify-token/route.ts new file mode 100644 index 00000000..2359a31f --- /dev/null +++ b/app/api/verify-token/route.ts @@ -0,0 +1,11 @@ +import { verifyJWT } from "@/app/lib/authMiddleware"; +import { NextRequest } from "next/server"; + +export async function GET(req: NextRequest) { + // verify jwt token + const authResponse = verifyJWT(req); + verifyJWT; + if (authResponse.status !== 200) { + return authResponse; + } +} diff --git a/app/components/AgentAvatar.tsx b/app/components/AgentAvatar.tsx deleted file mode 100644 index 54b118c8..00000000 --- a/app/components/AgentAvatar.tsx +++ /dev/null @@ -1,30 +0,0 @@ -import { Avatar } from "@nextui-org/react"; -import { DgSvg } from "./DgSvg"; -import { Message } from "ai/react"; -import { useMessageData } from "../context/MessageMetadata"; -import { useAudioStore } from "../context/AudioStore"; -import { voiceMap } from "../context/Deepgram"; - -export const AgentAvatar = ({ - message, - className = "", -}: { - message: Message; - className?: string; -}) => { - const { audioStore } = useAudioStore(); - const { messageData } = useMessageData(); - - const foundAudio = audioStore.findLast((item) => item.id === message.id); - const foundData = messageData.findLast((item) => item.id === message.id); - - if (foundAudio?.model) { - return ; - } - - if (foundData?.ttsModel) { - return ; - } - - return ; -}; diff --git a/app/components/AgentControls.tsx b/app/components/AgentControls.tsx new file mode 100644 index 00000000..a4ff8bbf --- /dev/null +++ b/app/components/AgentControls.tsx @@ -0,0 +1,79 @@ +import { Tooltip } from "@nextui-org/react"; +import { useCallback } from "react"; +import { MicrophoneIcon } from "./icons/MicrophoneIcon"; +import { useWebSocketContext } from "../context/WebSocketContext"; + +export const AgentSettings = () => { + return ( + <> +
+        LLM: Claude 3 Haiku
+        Voice: Asteria
+ + ); +}; + +export const AgentControls = () => { + const { startStreaming, stopStreaming, microphoneOpen } = + useWebSocketContext(); + + const microphoneToggle = useCallback( + async (e: Event) => { + e.preventDefault(); + console.log("toogle the control"); + if (!microphoneOpen) { + startStreaming(); + } else { + stopStreaming(); + } + }, + [microphoneOpen, startStreaming, stopStreaming] + ); + + console.log("microphone control rendering"); + + return ( + + ); +}; diff --git a/app/components/ChatBubble.tsx b/app/components/ChatBubble.tsx index 083b12ea..85973d9d 100644 --- a/app/components/ChatBubble.tsx +++ b/app/components/ChatBubble.tsx @@ -1,28 +1,46 @@ -import { LeftBubble } from "./LeftBubble"; -import { Message } from "ai"; import { RightBubble } from "./RightBubble"; +import { DgSvg } from "./DgSvg"; +import { TextContent } from "./TextContext"; +import { AgentMessageHeader } from "./MessageHeader"; +import { AgentMessageAudio } from "./MessageAudio"; +import { Avatar } from "@nextui-org/react"; -// const isMessage = (message: Message | Metadata): message is Message { -// return typeof message === 'Message'; -// } - -function isUserMessage(message: any): message is Message { - return message.role === "user"; -} - -function isAssistantMessage(message: any): message is Message { - return message.role === "assistant"; -} - -export const ChatBubble = ({ message }: { message: any }) => { - if (isUserMessage(message)) { +export const AgentChatBubble = ({ message }: { message: any }) => { + if (message?.role === "user") { // chat user return ; - } else if (isAssistantMessage(message)) { - // chat assistant - return ; } else { - // other as-yet unlabelled messages - return <>; + // chat assistant + return ( + <> +
+
+
+
+ {message?.voice ? ( + + ) : ( + + )} +
+
+
+ +
+ +
+
+
+
+
+
+ +
+
+
+
+
+ + ); } }; diff --git a/app/components/Controls.tsx b/app/components/Controls.tsx deleted file mode 100644 index 3dc5d2a1..00000000 --- a/app/components/Controls.tsx +++ /dev/null @@ -1,134 +0,0 @@ -import { Message } from "ai/react"; -import { Tooltip } from "@nextui-org/react"; -import { useCallback, useEffect } from "react"; - -import { Download } from "./Download"; -import { MicrophoneIcon } from "./icons/MicrophoneIcon"; -import { SendIcon } from "./icons/SendIcon"; -import { Settings } from "./Settings"; -import { useMicrophone } from "../context/Microphone"; -import { useNowPlaying } from "react-nowplaying"; -import { useSubmit } from "../lib/hooks/useSubmit"; - -// Better to use library, a lot of complexity is involved -// in building the resizable input -import TextareaAutosize from 'react-textarea-autosize'; - - -export const Controls = ({ - input, - handleSubmit, - handleInputChange, - messages, -}: { - input: string; - handleSubmit: any; - handleInputChange: any; - messages: Message[]; -}) => { - const { startMicrophone, stopMicrophone, microphoneOpen } = useMicrophone(); - const { formRef, onKeyDown } = useSubmit() - - useEffect(() => { - startMicrophone(); - // eslint-disable-next-line react-hooks/exhaustive-deps - }, []) - - const microphoneToggle = useCallback( - async (e: Event) => { - e.preventDefault(); - - if (microphoneOpen) { - stopMicrophone(); - } else { - startMicrophone(); - } - }, - [microphoneOpen, startMicrophone, stopMicrophone] - ); - - const { stop: stopAudio } = useNowPlaying(); - - const submitter = useCallback( - (e: any) => { - handleSubmit(e); - stopAudio(); - e.target.value = ''; - handleInputChange(e) - }, - // eslint-disable-next-line react-hooks/exhaustive-deps - [stopAudio, handleSubmit] - ); - - return ( -
- -
- ); -}; diff --git a/app/components/Conversation.tsx b/app/components/Conversation.tsx deleted file mode 100644 index 66c90bd5..00000000 --- a/app/components/Conversation.tsx +++ /dev/null @@ -1,490 +0,0 @@ -"use client"; - -import { - LiveClient, - LiveConnectionState, - LiveTranscriptionEvent, - LiveTranscriptionEvents, -} from "@deepgram/sdk"; -import { Message, useChat } from "ai/react"; -import { NextUIProvider } from "@nextui-org/react"; -import { useMicVAD } from "@ricky0123/vad-react"; -import { useNowPlaying } from "react-nowplaying"; -import { useQueue } from "@uidotdev/usehooks"; -import { useState, useEffect, useCallback, useRef, useMemo } from "react"; - -import { ChatBubble } from "./ChatBubble"; -import { - contextualGreeting, - generateRandomString, - utteranceText, -} from "../lib/helpers"; -import { Controls } from "./Controls"; -import { InitialLoad } from "./InitialLoad"; -import { MessageMetadata } from "../lib/types"; -import { RightBubble } from "./RightBubble"; -import { systemContent } from "../lib/constants"; -import { useDeepgram } from "../context/Deepgram"; -import { useMessageData } from "../context/MessageMetadata"; -import { useMicrophone } from "../context/Microphone"; -import { useAudioStore } from "../context/AudioStore"; - -/** - * Conversation element that contains the conversational AI app. - * @returns {JSX.Element} - */ -export default function Conversation(): JSX.Element { - /** - * Custom context providers - */ - const { ttsOptions, connection, connectionReady } = useDeepgram(); - const { addAudio } = useAudioStore(); - const { player, stop: stopAudio, play: startAudio } = useNowPlaying(); - const { addMessageData } = useMessageData(); - const { - microphoneOpen, - queue: microphoneQueue, - queueSize: microphoneQueueSize, - firstBlob, - removeBlob, - stream, - } = useMicrophone(); - - /** - * Queues - */ - const { - add: addTranscriptPart, - queue: transcriptParts, - clear: clearTranscriptParts, - } = useQueue<{ is_final: boolean; speech_final: boolean; text: string }>([]); - - /** - * Refs - */ - const messageMarker = useRef(null); - - /** - * State - */ - const [initialLoad, setInitialLoad] = useState(true); - const [isProcessing, setProcessing] = useState(false); - - /** - * Request audio from API - */ - const requestTtsAudio = useCallback( - async (message: Message) => { - const start = Date.now(); - const model = ttsOptions?.model ?? "aura-asteria-en"; - - const res = await fetch(`/api/speak?model=${model}`, { - cache: "no-store", - method: "POST", - body: JSON.stringify(message), - }); - - const headers = res.headers; - - const blob = await res.blob(); - - startAudio(blob, "audio/mp3", message.id).then(() => { - addAudio({ - id: message.id, - blob, - latency: Number(headers.get("X-DG-Latency")) ?? 
Date.now() - start, - networkLatency: Date.now() - start, - model, - }); - }); - }, - // eslint-disable-next-line react-hooks/exhaustive-deps - [ttsOptions?.model] - ); - - const [llmNewLatency, setLlmNewLatency] = useState<{ - start: number; - response: number; - }>(); - - const onFinish = useCallback( - (msg: any) => { - requestTtsAudio(msg); - }, - [requestTtsAudio] - ); - - const onResponse = useCallback((res: Response) => { - (async () => { - setLlmNewLatency({ - start: Number(res.headers.get("x-llm-start")), - response: Number(res.headers.get("x-llm-response")), - }); - })(); - }, []); - - const systemMessage: Message = useMemo( - () => ({ - id: generateRandomString(7), - role: "system", - content: systemContent, - }), - [] - ); - - const greetingMessage: Message = useMemo( - () => ({ - id: generateRandomString(7), - role: "assistant", - content: contextualGreeting(), - }), - [] - ); - - /** - * AI SDK - */ - const { - messages: chatMessages, - append, - handleInputChange, - input, - handleSubmit, - isLoading: llmLoading, - } = useChat({ - id: "aura", - api: "/api/brain", - initialMessages: [systemMessage, greetingMessage], - onFinish, - onResponse, - }); - - const [currentUtterance, setCurrentUtterance] = useState(); - const [failsafeTimeout, setFailsafeTimeout] = useState(); - const [failsafeTriggered, setFailsafeTriggered] = useState(false); - - const onSpeechEnd = useCallback(() => { - /** - * We have the audio data context available in VAD - * even before we start sending it to deepgram. - * So ignore any VAD events before we "open" the mic. - */ - if (!microphoneOpen) return; - - setFailsafeTimeout( - setTimeout(() => { - if (currentUtterance) { - console.log("failsafe fires! pew pew!!"); - setFailsafeTriggered(true); - append({ - role: "user", - content: currentUtterance, - }); - clearTranscriptParts(); - setCurrentUtterance(undefined); - } - }, 1500) - ); - - return () => { - clearTimeout(failsafeTimeout); - }; - - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [microphoneOpen, currentUtterance]); - - const onSpeechStart = () => { - /** - * We have the audio data context available in VAD - * even before we start sending it to deepgram. - * So ignore any VAD events before we "open" the mic. - */ - if (!microphoneOpen) return; - - /** - * We we're talking again, we want to wait for a transcript. - */ - setFailsafeTriggered(false); - - if (!player?.ended) { - stopAudio(); - console.log("barging in! 
SHH!"); - } - }; - - useMicVAD({ - startOnLoad: true, - stream, - onSpeechStart, - onSpeechEnd, - positiveSpeechThreshold: 0.6, - negativeSpeechThreshold: 0.6 - 0.15, - }); - - useEffect(() => { - if (llmLoading) return; - if (!llmNewLatency) return; - - const latestLlmMessage: MessageMetadata = { - ...chatMessages[chatMessages.length - 1], - ...llmNewLatency, - end: Date.now(), - ttsModel: ttsOptions?.model, - }; - - addMessageData(latestLlmMessage); - }, [ - chatMessages, - llmNewLatency, - setLlmNewLatency, - llmLoading, - addMessageData, - ttsOptions?.model, - ]); - - /** - * Contextual functions - */ - const requestWelcomeAudio = useCallback(async () => { - requestTtsAudio(greetingMessage); - }, [greetingMessage, requestTtsAudio]); - - const startConversation = useCallback(() => { - if (!initialLoad) return; - - setInitialLoad(false); - - // add a stub message data with no latency - const welcomeMetadata: MessageMetadata = { - ...greetingMessage, - ttsModel: ttsOptions?.model, - }; - - addMessageData(welcomeMetadata); - - // get welcome audio - requestWelcomeAudio(); - }, [ - addMessageData, - greetingMessage, - initialLoad, - requestWelcomeAudio, - ttsOptions?.model, - ]); - - useEffect(() => { - const onTranscript = (data: LiveTranscriptionEvent) => { - let content = utteranceText(data); - - // i only want an empty transcript part if it is speech_final - if (content !== "" || data.speech_final) { - /** - * use an outbound message queue to build up the unsent utterance - */ - addTranscriptPart({ - is_final: data.is_final as boolean, - speech_final: data.speech_final as boolean, - text: content, - }); - } - }; - - const onOpen = (connection: LiveClient) => { - connection.addListener(LiveTranscriptionEvents.Transcript, onTranscript); - }; - - if (connection) { - connection.addListener(LiveTranscriptionEvents.Open, onOpen); - } - - return () => { - connection?.removeListener(LiveTranscriptionEvents.Open, onOpen); - connection?.removeListener( - LiveTranscriptionEvents.Transcript, - onTranscript - ); - }; - }, [addTranscriptPart, connection]); - - const getCurrentUtterance = useCallback(() => { - return transcriptParts.filter(({ is_final, speech_final }, i, arr) => { - return is_final || speech_final || (!is_final && i === arr.length - 1); - }); - }, [transcriptParts]); - - const [lastUtterance, setLastUtterance] = useState(); - - useEffect(() => { - const parts = getCurrentUtterance(); - const last = parts[parts.length - 1]; - const content = parts - .map(({ text }) => text) - .join(" ") - .trim(); - - /** - * if the entire utterance is empty, don't go any further - * for example, many many many empty transcription responses - */ - if (!content) return; - - /** - * failsafe was triggered since we last sent a message to TTS - */ - if (failsafeTriggered) { - clearTranscriptParts(); - setCurrentUtterance(undefined); - return; - } - - /** - * display the concatenated utterances - */ - setCurrentUtterance(content); - - /** - * record the last time we recieved a word - */ - if (last.text !== "") { - setLastUtterance(Date.now()); - } - - /** - * if the last part of the utterance, empty or not, is speech_final, send to the LLM. 
- */ - if (last && last.speech_final) { - clearTimeout(failsafeTimeout); - append({ - role: "user", - content, - }); - clearTranscriptParts(); - setCurrentUtterance(undefined); - } - }, [ - getCurrentUtterance, - clearTranscriptParts, - append, - failsafeTimeout, - failsafeTriggered, - ]); - - /** - * magic microphone audio queue processing - */ - useEffect(() => { - const processQueue = async () => { - if (microphoneQueueSize > 0 && !isProcessing) { - setProcessing(true); - - if (connectionReady) { - const nextBlob = firstBlob; - - if (nextBlob && nextBlob?.size > 0) { - connection?.send(nextBlob); - } - - removeBlob(); - } - - const waiting = setTimeout(() => { - clearTimeout(waiting); - setProcessing(false); - }, 200); - } - }; - - processQueue(); - }, [ - connection, - microphoneQueue, - removeBlob, - firstBlob, - microphoneQueueSize, - isProcessing, - connectionReady, - ]); - - /** - * keep deepgram connection alive when mic closed - */ - useEffect(() => { - let keepAlive: any; - if (connection && connectionReady && !microphoneOpen) { - keepAlive = setInterval(() => { - // should stop spamming dev console when working on frontend in devmode - if (connection?.getReadyState() !== LiveConnectionState.OPEN) { - clearInterval(keepAlive); - } else { - connection.keepAlive(); - } - }, 10000); - } else { - clearInterval(keepAlive); - } - - // prevent duplicate timeouts - return () => { - clearInterval(keepAlive); - }; - }, [connection, connectionReady, microphoneOpen]); - - // this works - useEffect(() => { - if (messageMarker.current) { - messageMarker.current.scrollIntoView({ - behavior: "auto", - }); - } - }, [chatMessages]); - - return ( - <> - -
-
-
-
-
-
- {initialLoad ? ( - - ) : ( - <> - {chatMessages.length > 0 && - chatMessages.map((message, i) => ( - - ))} - - {currentUtterance && ( - - )} - -
- - )} -
-
- {!initialLoad && ( - - )} -
-
-
-
-
- - ); -} diff --git a/app/components/ConversationAgent.tsx b/app/components/ConversationAgent.tsx new file mode 100644 index 00000000..64fbeaf4 --- /dev/null +++ b/app/components/ConversationAgent.tsx @@ -0,0 +1,79 @@ +"use client"; +import { useState, useEffect, useRef } from "react"; +import { useWebSocketContext } from "../context/WebSocketContext"; +import { AgentControls } from "./AgentControls"; +import { InitialLoadAgent } from "./InitialLoadAgent"; +import { NextUIProvider } from "@nextui-org/react"; +import { AgentChatBubble } from "./ChatBubble"; + +/** + * Conversation element that contains the conversational AI app. + * @returns {JSX.Element} + */ +export default function ConversationAgent(): JSX.Element { + const { startStreaming, chatMessages, connection } = useWebSocketContext(); + /** + * Refs + */ + const messageMarker = useRef(null); + + /** + * State + */ + const [initialLoad, setInitialLoad] = useState(true); + + const startConversation = () => { + startStreaming(); + setInitialLoad(false); + }; + + // this works + useEffect(() => { + if (messageMarker.current) { + messageMarker.current.scrollIntoView({ + behavior: "auto", + }); + } + }, [chatMessages]); + + return ( + <> + +
+
+
+
+
+
+ {initialLoad ? ( + + ) : ( + <> + {chatMessages.length > 0 && + chatMessages.map((message, i) => ( + + ))} +
+ + )} +
+
              {!initialLoad && <AgentControls />}
+
+
+
+
+ + ); +} diff --git a/app/components/Download.tsx b/app/components/Download.tsx deleted file mode 100644 index f9fe92c0..00000000 --- a/app/components/Download.tsx +++ /dev/null @@ -1,49 +0,0 @@ -import { Message } from "ai/react"; -import { DownloadIcon } from "./icons/DownloadIcon"; -import { voiceMap } from "../context/Deepgram"; -import { useAudioStore } from "../context/AudioStore"; - -const DownloadButton = ({ content }: { content: string }) => { - const file = new Blob([content], { type: "text/plain" }); - - return ( - - - - Download transcript - - - ); -}; - -export const Download = ({ messages }: { messages: Message[] }) => { - const { audioStore } = useAudioStore(); - const context = messages - .filter((m) => ["user", "assistant"].includes(m.role)) - .map((m) => { - if (m.role === "assistant") { - const foundAudio = audioStore.findLast((item) => item.id === m.id); - const voice = foundAudio?.model - ? voiceMap(foundAudio?.model).name - : "Deepgram"; - - return `${voice ?? "Asteria"}: ${m.content}`; - } - - if (m.role === "user") { - return `User: ${m.content}`; - } - }); - - return ( -
- -
- ); -}; diff --git a/app/components/InitialLoad.tsx b/app/components/InitialLoadAgent.tsx similarity index 70% rename from app/components/InitialLoad.tsx rename to app/components/InitialLoadAgent.tsx index 57e93e72..90ec095a 100644 --- a/app/components/InitialLoad.tsx +++ b/app/components/InitialLoadAgent.tsx @@ -2,7 +2,13 @@ import { Headphones } from "./Headphones"; import { isBrowser } from "react-device-detect"; import { Spinner } from "@nextui-org/react"; -export const InitialLoad = ({ fn, connecting = true }: { fn: () => void, connecting: boolean }) => { +export const InitialLoadAgent = ({ + fn, + connecting = true, +}: { + fn: () => void; + connecting: boolean; +}) => { return ( <>
@@ -13,17 +19,13 @@ export const InitialLoad = ({ fn, connecting = true }: { fn: () => void, connect className="relative block w-full glass p-6 sm:p-8 lg:p-12 rounded-xl" >

-            Welcome to Deepgram's
-            AI Agent Tech Demo.
+            Deepgram Agent API Demo
-            Nova-2 Speech-to-Text
-            OpenAI GPT-3.5 Turbo
-            Aura Text-to-Speech
+            For optimal enjoyment, we recommend using headphones
+            while using this application. Minor bugs and annoyances may appear
+            while using this demo. Pull requests are welcome.
{connecting ? ( @@ -36,11 +38,6 @@ export const InitialLoad = ({ fn, connecting = true }: { fn: () => void, connect )}
-            For optimal enjoyment, we recommend using headphones
-            while using this application. Minor bugs and annoyances may appear
-            while using this demo. Pull requests are welcome.
diff --git a/app/components/LeftBubble.tsx b/app/components/LeftBubble.tsx deleted file mode 100644 index 746f3caf..00000000 --- a/app/components/LeftBubble.tsx +++ /dev/null @@ -1,39 +0,0 @@ -import { AgentAvatar } from "./AgentAvatar"; -import { Message } from "ai/react"; -import { MessageAudio } from "./MessageAudio"; -import { MessageHeader } from "./MessageHeader"; -import { MessageMeta } from "./MessageMeta"; -import { TextContent } from "./TextContext"; - -export const LeftBubble = ({ message }: { message: Message }) => { - return ( - <> -
-
-
-
- -
-
-
- -
- -
-
-
-
-
-
- -
- -
-
-
-
- -
- - ); -}; diff --git a/app/components/MessageAudio.tsx b/app/components/MessageAudio.tsx index e220cf70..5ff8ea87 100644 --- a/app/components/MessageAudio.tsx +++ b/app/components/MessageAudio.tsx @@ -1,67 +1,111 @@ -import { Message } from "ai/react"; -import { Spinner } from "@nextui-org/react"; -import { useCallback, useEffect, useMemo, useState } from "react"; +import { useCallback, useEffect, useRef, useState } from "react"; -import { useAudioStore } from "../context/AudioStore"; -import { useNowPlaying } from "react-nowplaying"; - -const MessageAudio = ({ - message: { id }, +const AgentMessageAudio = ({ + message, className = "", ...rest }: { - message: Message; + message: { id: string; audio: ArrayBuffer }; className?: string; }) => { - const { audioStore } = useAudioStore(); - const { player, uid, resume: resumeAudio, play: playAudio } = useNowPlaying(); const [playing, setPlaying] = useState(false); + const audioContextRef = useRef(null); + const sourceNodeRef = useRef(null); - const found = useMemo(() => { - return audioStore.find((item) => item.id === id); - }, [audioStore, id]); + const replayAudio = useCallback((audioData: ArrayBuffer) => { + // Stop any existing playback + if (sourceNodeRef.current) { + sourceNodeRef.current.stop(); + sourceNodeRef.current.disconnect(); + } + if (audioContextRef.current) { + audioContextRef.current.close().catch(console.log); + } - useEffect(() => { - setPlaying(uid === id); - }, [uid, id]); + // Create a new AudioContext + const audioContext = new (window.AudioContext || + window.webkitAudioContext)(); + audioContextRef.current = audioContext; - const pause = useCallback(() => { - if (!player) return; + const audioDataView = new Int16Array(audioData); - player.pause(); - setPlaying(false); - }, [player]); + if (audioDataView.length === 0) { + console.error("Received audio data is empty."); + audioContext.close().catch(console.log); + return; + } - const play = useCallback(() => { - if (!player || !found) return; + const audioBuffer = audioContext.createBuffer( + 1, + audioDataView.length, + 48000 + ); + const audioBufferChannel = audioBuffer.getChannelData(0); - if (uid === found.id) { - resumeAudio(); - } else if (found) { - playAudio(found.blob, "audio/mp3", id); + for (let i = 0; i < audioDataView.length; i++) { + audioBufferChannel[i] = audioDataView[i] / 32768; // Convert linear16 PCM to float [-1, 1] } + const source = audioContext.createBufferSource(); + sourceNodeRef.current = source; + source.buffer = audioBuffer; + source.connect(audioContext.destination); + + source.onended = () => { + setPlaying(false); + source.disconnect(); + audioContext.close().catch((error) => { + console.error("Error closing AudioContext:", error); + }); + }; + + source.start(); setPlaying(true); - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [uid, found, id]); - - /** - * Spinner if still waiting for a response - */ - if (!found) { - return ; - } + }, []); + + const play = useCallback(() => { + if (message.audio) { + replayAudio(message.audio); + } + }, [message.audio, replayAudio]); + + const pause = useCallback(() => { + if (sourceNodeRef.current) { + sourceNodeRef.current.stop(); + sourceNodeRef.current.disconnect(); + } + if (audioContextRef.current) { + audioContextRef.current.close().catch((error) => { + console.error("Error closing AudioContext:", error); + }); + } + setPlaying(false); + }, []); + + useEffect(() => { + return () => { + // Clean up audio playback when component unmounts + if (sourceNodeRef.current) { + 
sourceNodeRef.current.stop(); + sourceNodeRef.current.disconnect(); + } + if (audioContextRef.current) { + audioContextRef.current.close().catch((error) => { + console.error("Error closing AudioContext:", error); + }); + } + }; + }, []); - /** - * Pause button - * - * audio === this message - * AND - * playing === true - */ if (playing) { return ( - pause!()}> + { + e.preventDefault(); + pause(); + }} + > play()}> + { + e.preventDefault(); + play(); + }} + > ; + return null; }; -export { MessageAudio }; +export { AgentMessageAudio }; diff --git a/app/components/MessageHeader.tsx b/app/components/MessageHeader.tsx index 049b7154..43f4c4d9 100644 --- a/app/components/MessageHeader.tsx +++ b/app/components/MessageHeader.tsx @@ -1,32 +1,16 @@ -import { Message } from "ai/react"; -import { useMessageData } from "../context/MessageMetadata"; -import { useAudioStore } from "../context/AudioStore"; -import { voiceMap } from "../context/Deepgram"; import moment from "moment"; -const MessageHeader = ({ +const AgentMessageHeader = ({ message, className = "", }: { - message: Message; + message: any; className?: string; }) => { - const { audioStore } = useAudioStore(); - const { messageData } = useMessageData(); - - const foundAudio = audioStore.findLast((item) => item.id === message.id); - const foundData = messageData.findLast((item) => item.id === message.id); - if (message.role === "assistant") { return (
- - {foundAudio?.model - ? voiceMap(foundAudio?.model).name - : foundData?.ttsModel - ? voiceMap(foundData?.ttsModel).name - : "Deepgram AI"} - + Asteria {moment().calendar()} @@ -35,4 +19,4 @@ const MessageHeader = ({ } }; -export { MessageHeader }; +export { AgentMessageHeader }; diff --git a/app/components/MessageMeta.tsx b/app/components/MessageMeta.tsx deleted file mode 100644 index 68f6857d..00000000 --- a/app/components/MessageMeta.tsx +++ /dev/null @@ -1,89 +0,0 @@ -import { Message } from "ai/react"; -import { useAudioStore } from "../context/AudioStore"; -import { useState } from "react"; -import { CaretIcon } from "./icons/CaretIcon"; -import { useMessageData } from "../context/MessageMetadata"; -import { Tooltip } from "@nextui-org/react"; -import { BoltIcon } from "./icons/BoltIcon"; - -const TTFB = () => ( - - Time to first-byte - -); - -const MessageMeta = ({ - message, - className = "", -}: { - message: Message; - className?: string; -}) => { - const { audioStore } = useAudioStore(); - const { messageData } = useMessageData(); - const [breakdown, setBreakdown] = useState(false); - - const foundData = messageData.findLast((item) => item.id === message.id); - const foundAudio = audioStore.findLast((item) => item.id === message.id); - - if (!foundAudio) return; - - if (message.role === "assistant") { - const llmTotal = Number(foundData?.end) - Number(foundData?.start); - const ttsTtfb = foundAudio.latency; - const ttsTotal = foundAudio.networkLatency; - - return ( -
-
- - - - - TTS : {(ttsTtfb / 1000).toFixed(1)}s - - - {!!llmTotal && ( - - LLM total: {(llmTotal / 1000).toFixed(1)}s - - )} - - TTS total: {(ttsTotal / 1000).toFixed(1)}s - -
-
- {!!llmTotal && ( - - LLM total: {(llmTotal / 1000).toFixed(1)}s - - )} - - - TTS total: {(ttsTotal / 1000).toFixed(1)}s - -
-
- ); - } -}; - -export { MessageMeta }; diff --git a/app/components/RightBubble.tsx b/app/components/RightBubble.tsx index 819d5b5f..f94f1812 100644 --- a/app/components/RightBubble.tsx +++ b/app/components/RightBubble.tsx @@ -1,5 +1,4 @@ import { Message } from "ai/react"; -import { MessageMeta } from "./MessageMeta"; import { TextContent } from "./TextContext"; import { UserAvatar } from "./UserAvatar"; @@ -25,7 +24,6 @@ export const RightBubble = ({
- ); diff --git a/app/components/Settings.tsx b/app/components/Settings.tsx deleted file mode 100644 index a2f7e86b..00000000 --- a/app/components/Settings.tsx +++ /dev/null @@ -1,174 +0,0 @@ -import { CogIcon } from "./icons/CogIcon"; -import { - Avatar, - Button, - Modal, - ModalBody, - ModalContent, - ModalFooter, - ModalHeader, - Select, - SelectItem, - useDisclosure, -} from "@nextui-org/react"; -import { useDeepgram, voiceMap, voices } from "../context/Deepgram"; -import { Dispatch, SetStateAction, useState } from "react"; -import { useToast } from "../context/Toast"; - -const arrayOfVoices = Object.entries(voices).map((e) => ({ - ...e[1], - model: e[0], -})); - -const ModelSelection = ({ - model, - setModel, -}: { - model: string; - setModel: Dispatch>; -}) => { - return ( - - ); -}; - -export const Settings = () => { - const { toast } = useToast(); - const { isOpen, onOpen, onOpenChange } = useDisclosure(); - const { ttsOptions, setTtsOptions } = useDeepgram(); - - const [model, setModel] = useState(ttsOptions?.model as string); - - return ( - <> -
- - - - Change settings - - - - Voice:{" "} - - {voiceMap(ttsOptions?.model as string).name} - - -
- - - {(onClose) => { - const saveAndClose = () => { - setTtsOptions({ ...ttsOptions, model }); - - toast("Options saved."); - - onClose(); - }; - - return ( - <> - - Settings - - -

Text-to-Speech Settings

- -
- - - - - ); - }} -
-
- - ); -}; - -// ; diff --git a/app/config/index.ts b/app/config/index.ts new file mode 100644 index 00000000..c80192d3 --- /dev/null +++ b/app/config/index.ts @@ -0,0 +1,11 @@ +const secret = process.env.JWT_SECRET || 'deepgram-secret'; +const deepGramUri = process.env.DEEPGRAM_STT_DOMAIN; +const deepGramApiKey = process.env.DEEPGRAM_API_KEY; +const deepGramEnv = process.env.DEEPGRAM_ENV; + +export const config = { + jwtSecret: secret, + deepGramUri, + deepGramApiKey, + deepGramEnv, +} \ No newline at end of file diff --git a/app/context/AudioStore.tsx b/app/context/AudioStore.tsx deleted file mode 100644 index f25f3213..00000000 --- a/app/context/AudioStore.tsx +++ /dev/null @@ -1,47 +0,0 @@ -"use client"; - -import { createContext, useCallback, useContext, useState } from "react"; - -type AudioStoreContext = { - audioStore: AudioPacket[]; - addAudio: (queueItem: AudioPacket) => void; -}; - -export interface AudioPacket { - id: string; - blob: Blob; - latency: number; - networkLatency: number; - model: string; -} - -interface AudioStoreItemContextInterface { - children: React.ReactNode; -} - -const AudioStoreContext = createContext({} as AudioStoreContext); - -export const AudioStoreContextProvider = ({ - children, -}: AudioStoreItemContextInterface) => { - const [audioStore, setAudioStore] = useState([]); - - const addAudio = useCallback((queueItem: AudioPacket): void => { - setAudioStore((q) => [...q, queueItem]); - }, []); - - return ( - - {children} - - ); -}; - -export function useAudioStore() { - return useContext(AudioStoreContext); -} diff --git a/app/context/Auth.tsx b/app/context/Auth.tsx new file mode 100644 index 00000000..e87cacc2 --- /dev/null +++ b/app/context/Auth.tsx @@ -0,0 +1,85 @@ +"use client"; +import { Spinner } from "@nextui-org/react"; +import React, { + createContext, + useContext, + useState, + useEffect, + ReactNode, + useCallback, +} from "react"; +import { toast } from "react-toastify"; +import cookie from "js-cookie"; + +interface AuthContextType { + token: string | null; + login: (token: string) => void; + logout: () => void; + handleLogin: () => void; + isAuthenticated: boolean; +} + +const AuthContext = createContext(undefined); + +export const AuthContextProvider = ({ children }: { children: ReactNode }) => { + const [token, setToken] = useState(null); + const [isLoading, setLoading] = useState(false); + + const login = (newToken: string) => { + cookie.set("dipjwtToken", newToken, { expires: 4 / 24 }); + setToken(newToken); + }; + + const handleLogin = useCallback(async () => { + // API route that generates the JWT token + const response = await fetch("/api/generate-token", { + method: "POST", + }); + const data = await response.json(); + login(data.token); + }, []); + + const logout = () => { + cookie.remove("dipjwtToken"); + setToken(null); + }; + + useEffect(() => { + // Load token from cookies when the app initializes + const loadToken = () => { + setLoading(true); + try { + const savedToken = cookie.get("dipjwtToken"); + // saved token verification + if (savedToken) { + setToken(savedToken); + } else { + handleLogin(); + } + } catch (err: any) { + logout(); + toast.error(err?.message); + } + setLoading(false); + }; + loadToken(); + }, [handleLogin]); + + const isAuthenticated = !!token; + + return ( + + {isLoading ? 
: children} + + ); +}; + +export const useAuth = (): AuthContextType => { + const context = useContext(AuthContext); + if (!context) { + throw new Error("useAuth must be used within an AuthProvider"); + } + return context; +}; diff --git a/app/context/Deepgram.tsx b/app/context/Deepgram.tsx deleted file mode 100644 index f46f4a9c..00000000 --- a/app/context/Deepgram.tsx +++ /dev/null @@ -1,251 +0,0 @@ -"use client"; - -import { - CreateProjectKeyResponse, - LiveClient, - LiveSchema, - LiveTranscriptionEvents, - SpeakSchema, -} from "@deepgram/sdk"; -import { - Dispatch, - SetStateAction, - createContext, - useCallback, - useContext, - useEffect, - useState, -} from "react"; -import { useToast } from "./Toast"; -import { useLocalStorage } from "../lib/hooks/useLocalStorage"; - -type DeepgramContext = { - ttsOptions: SpeakSchema | undefined; - setTtsOptions: (value: SpeakSchema) => void; - sttOptions: LiveSchema | undefined; - setSttOptions: (value: LiveSchema) => void; - connection: LiveClient | undefined; - connectionReady: boolean; -}; - -interface DeepgramContextInterface { - children: React.ReactNode; -} - -const DeepgramContext = createContext({} as DeepgramContext); - -const DEFAULT_TTS_MODEL = 'aura-asteria-en'; -const DEFAULT_STT_MODEL = 'nova-2'; - -const defaultTtsOptions = { - model: DEFAULT_TTS_MODEL -} - -const defaultSttsOptions = { - model: DEFAULT_STT_MODEL, - interim_results: true, - smart_format: true, - endpointing: 550, - utterance_end_ms: 1500, - filler_words: true, -} - -/** - * TTS Voice Options - */ -const voices: { - [key: string]: { - name: string; - avatar: string; - language: string; - accent: string; - }; -} = { - [DEFAULT_TTS_MODEL]: { - name: "Asteria", - avatar: "/aura-asteria-en.svg", - language: "English", - accent: "US", - }, - "aura-luna-en": { - name: "Luna", - avatar: "/aura-luna-en.svg", - language: "English", - accent: "US", - }, - "aura-stella-en": { - name: "Stella", - avatar: "/aura-stella-en.svg", - language: "English", - accent: "US", - }, - "aura-athena-en": { - name: "Athena", - avatar: "/aura-athena-en.svg", - language: "English", - accent: "UK", - }, - "aura-hera-en": { - name: "Hera", - avatar: "/aura-hera-en.svg", - language: "English", - accent: "US", - }, - "aura-orion-en": { - name: "Orion", - avatar: "/aura-orion-en.svg", - language: "English", - accent: "US", - }, - "aura-arcas-en": { - name: "Arcas", - avatar: "/aura-arcas-en.svg", - language: "English", - accent: "US", - }, - "aura-perseus-en": { - name: "Perseus", - avatar: "/aura-perseus-en.svg", - language: "English", - accent: "US", - }, - "aura-angus-en": { - name: "Angus", - avatar: "/aura-angus-en.svg", - language: "English", - accent: "Ireland", - }, - "aura-orpheus-en": { - name: "Orpheus", - avatar: "/aura-orpheus-en.svg", - language: "English", - accent: "US", - }, - "aura-helios-en": { - name: "Helios", - avatar: "/aura-helios-en.svg", - language: "English", - accent: "UK", - }, - "aura-zeus-en": { - name: "Zeus", - avatar: "/aura-zeus-en.svg", - language: "English", - accent: "US", - }, -}; - -const voiceMap = (model: string) => { - return voices[model]; -}; - -const getApiKey = async (): Promise => { - const result: CreateProjectKeyResponse = await ( - await fetch("/api/authenticate", { cache: "no-store" }) - ).json(); - - return result.key; -}; - -const DeepgramContextProvider = ({ children }: DeepgramContextInterface) => { - const { toast } = useToast(); - const [ttsOptions, setTtsOptions] = useLocalStorage('ttsModel'); - const [sttOptions, setSttOptions] = 
useLocalStorage('sttModel'); - const [connection, setConnection] = useState(); - const [connecting, setConnecting] = useState(false); - const [connectionReady, setConnectionReady] = useState(false); - - const connect = useCallback( - async (defaultSttsOptions: SpeakSchema) => { - if (!connection && !connecting) { - setConnecting(true); - - const connection = new LiveClient( - await getApiKey(), - {}, - defaultSttsOptions - ); - - setConnection(connection); - setConnecting(false); - } - // eslint-disable-next-line react-hooks/exhaustive-deps - }, - [connecting, connection] - ); - - useEffect(() => { - // it must be the first open of the page, let's set up the defaults - - // Why this is needed?, the requestTtsAudio of Conversation is wrapped in useCallback - // which has a dependency of ttsOptions model - // but the player inside the Nowplaying provider is set on mount, means - // the when the startAudio is called the player is undefined. - - // This can be fixed in 3 ways: - // 1. set player as a dependency inside the useCallback of requestTtsAudio - // 2. change the code of react-nowplaying to use the ref mechanism - // 3. follow the old code to avoid any risk i.e., first ttsOptions is undefined - // and later when it gets set, it also update the requestTtsAudio callback. - if (ttsOptions === undefined) { - setTtsOptions(defaultTtsOptions); - } - - if (!sttOptions === undefined) { - setSttOptions(defaultSttsOptions); - } - if (connection === undefined) { - connect(defaultSttsOptions); - } - }, [connect, connection, setSttOptions, setTtsOptions, sttOptions, ttsOptions]); - - useEffect(() => { - if (connection && connection?.getReadyState() !== undefined) { - connection.addListener(LiveTranscriptionEvents.Open, () => { - setConnectionReady(true); - }); - - connection.addListener(LiveTranscriptionEvents.Close, () => { - toast("The connection to Deepgram closed, we'll attempt to reconnect."); - setConnectionReady(false); - connection.removeAllListeners(); - setConnection(undefined); - }); - - connection.addListener(LiveTranscriptionEvents.Error, () => { - toast( - "An unknown error occured. We'll attempt to reconnect to Deepgram." 
- ); - setConnectionReady(false); - connection.removeAllListeners(); - setConnection(undefined); - }); - } - - return () => { - setConnectionReady(false); - connection?.removeAllListeners(); - }; - }, [connection, toast]); - - return ( - - {children} - - ); -}; - -function useDeepgram() { - return useContext(DeepgramContext); -} - -export { DeepgramContextProvider, useDeepgram, voiceMap, voices }; diff --git a/app/context/MessageMetadata.tsx b/app/context/MessageMetadata.tsx deleted file mode 100644 index efdfdbd0..00000000 --- a/app/context/MessageMetadata.tsx +++ /dev/null @@ -1,51 +0,0 @@ -"use client"; - -import { - Dispatch, - SetStateAction, - createContext, - useCallback, - useContext, - useState, -} from "react"; -import { MessageMetadata } from "../lib/types"; - -type MessageMetadataContext = { - messageData: MessageMetadata[]; - setMessageData: Dispatch>; - addMessageData: (queueItem: MessageMetadata) => void; -}; - -interface MessageMetadataContextInterface { - children: React.ReactNode; -} - -const MessageMetadataContext = createContext({} as MessageMetadataContext); - -const MessageMetadataContextProvider = ({ - children, -}: MessageMetadataContextInterface) => { - const [messageData, setMessageData] = useState([]); - - const addMessageData = useCallback((queueItem: MessageMetadata): void => { - setMessageData((q) => [...q, queueItem]); - }, []); - - return ( - - {children} - - ); -}; - -function useMessageData() { - return useContext(MessageMetadataContext); -} - -export { MessageMetadataContextProvider, useMessageData }; diff --git a/app/context/Microphone.tsx b/app/context/Microphone.tsx deleted file mode 100644 index ed73062f..00000000 --- a/app/context/Microphone.tsx +++ /dev/null @@ -1,134 +0,0 @@ -"use client"; - -import { useQueue } from "@uidotdev/usehooks"; -import { - Dispatch, - SetStateAction, - createContext, - useCallback, - useContext, - useEffect, - useState, -} from "react"; - -type MicrophoneContext = { - microphone: MediaRecorder | undefined; - setMicrophone: Dispatch>; - startMicrophone: () => void; - stopMicrophone: () => void; - microphoneOpen: boolean; - enqueueBlob: (element: Blob) => void; - removeBlob: () => Blob | undefined; - firstBlob: Blob | undefined; - queueSize: number; - queue: Blob[]; - stream: MediaStream | undefined; -}; - -interface MicrophoneContextInterface { - children: React.ReactNode; -} - -const MicrophoneContext = createContext({} as MicrophoneContext); - -const MicrophoneContextProvider = ({ - children, -}: MicrophoneContextInterface) => { - const [microphone, setMicrophone] = useState(); - const [stream, setStream] = useState(); - const [microphoneOpen, setMicrophoneOpen] = useState(false); - - const { - add: enqueueBlob, // addMicrophoneBlob, - remove: removeBlob, // removeMicrophoneBlob, - first: firstBlob, // firstMicrophoneBlob, - size: queueSize, // countBlobs, - queue, // : microphoneBlobs, - } = useQueue([]); - - useEffect(() => { - async function setupMicrophone() { - const stream = await navigator.mediaDevices.getUserMedia({ - audio: { - noiseSuppression: true, - echoCancellation: true, - }, - }); - - setStream(stream); - - const microphone = new MediaRecorder(stream); - - setMicrophone(microphone); - } - - if (!microphone) { - setupMicrophone(); - } - }, [enqueueBlob, microphone, microphoneOpen]); - - useEffect(() => { - if (!microphone) return; - - microphone.ondataavailable = (e) => { - if (microphoneOpen) enqueueBlob(e.data); - }; - - return () => { - microphone.ondataavailable = null; - }; - }, [enqueueBlob, 
microphone, microphoneOpen]); - - const stopMicrophone = useCallback(() => { - if (microphone?.state === "recording") microphone?.pause(); - - setMicrophoneOpen(false); - }, [microphone]); - - const startMicrophone = useCallback(() => { - if (microphone?.state === "paused") { - microphone?.resume(); - } else { - microphone?.start(250); - } - - setMicrophoneOpen(true); - }, [microphone]); - - useEffect(() => { - const eventer = () => - document.visibilityState !== "visible" && stopMicrophone(); - - window.addEventListener("visibilitychange", eventer); - - return () => { - window.removeEventListener("visibilitychange", eventer); - }; - }, [stopMicrophone]); - - return ( - - {children} - - ); -}; - -function useMicrophone() { - return useContext(MicrophoneContext); -} - -export { MicrophoneContextProvider, useMicrophone }; diff --git a/app/context/Toast.tsx b/app/context/Toast.tsx index e0972075..5c4eac81 100644 --- a/app/context/Toast.tsx +++ b/app/context/Toast.tsx @@ -1,7 +1,7 @@ "use client"; import { Bounce, ToastContainer, toast } from "react-toastify"; -import { createContext, useContext, useEffect } from "react"; +import { createContext, useContext } from "react"; import "react-toastify/dist/ReactToastify.css"; type ToastContext = { @@ -26,7 +26,9 @@ const ToastContextProvider = ({ children }: ToastContextInterface) => { limit={1} transition={Bounce} className={"md:w-96 mb-26"} - progressClassName={"bg-gradient-to-r bg-gradient to-[#13EF93]/50 from-[#149AFB]/80"} + progressClassName={ + "bg-gradient-to-r bg-gradient to-[#13EF93]/50 from-[#149AFB]/80" + } /> diff --git a/app/context/WebSocketContext.tsx b/app/context/WebSocketContext.tsx new file mode 100644 index 00000000..ef044729 --- /dev/null +++ b/app/context/WebSocketContext.tsx @@ -0,0 +1,529 @@ +"use client"; +import React, { + createContext, + ReactNode, + useContext, + useEffect, + useMemo, + useRef, + useState, + useCallback, +} from "react"; +import useWebSocket, { ReadyState } from "react-use-websocket"; +import { getApiKey } from "../lib/helpers"; +import { useAuth } from "./Auth"; + +// Types and Interfaces +type Message = { + content: string; + role: string; + audio?: ArrayBuffer; + voice?: string; + id: number | string; +}; + +type Speaker = "user" | "user-waiting" | "model" | null; + +interface WebSocketContextValue { + lastMessage: MessageEvent | null; + readyState: ReadyState; + connection: boolean; + voice: string; + model: string; + currentSpeaker: Speaker; + microphoneOpen: boolean; + chatMessages: Message[]; + sendMessage: (message: ArrayBuffer | string) => void; + startStreaming: () => Promise; + stopStreaming: () => void; + setVoice: (v: string) => void; + setModel: (v: string) => void; + replayAudio: (audioData: ArrayBuffer) => (() => void) | undefined; +} + +type WebSocketProviderProps = { children: ReactNode }; + +// Constants +const DEEPGRAM_SOCKET_URL = process.env + .NEXT_PUBLIC_DEEPGRAM_SOCKET_URL as string; +const PING_INTERVAL = 8000; // 8s + +// Context Creation +const WebSocketContext = createContext( + undefined +); + +// Utility functions +const concatArrayBuffers = (buffer1: ArrayBuffer, buffer2: ArrayBuffer) => { + const tmp = new Uint8Array(buffer1.byteLength + buffer2.byteLength); + tmp.set(new Uint8Array(buffer1), 0); + tmp.set(new Uint8Array(buffer2), buffer1.byteLength); + return tmp.buffer; +}; + +// WebSocket Provider Component +export const WebSocketProvider = ({ children }: WebSocketProviderProps) => { + const { token } = useAuth(); + // State + const [connection, setConnection] = 
useState(false); + const [voice, setVoice] = useState("aura-asteria-en"); + const [model, setModel] = useState("anthropic+claude-3-haiku-20240307"); + const [currentSpeaker, setCurrentSpeaker] = useState(null); + const [microphoneOpen, setMicrophoneOpen] = useState(true); + const [chatMessages, setChatMessages] = useState([]); + const [socketURL, setSocketUrl] = useState( + `${DEEPGRAM_SOCKET_URL}?t=${Date.now()}` + ); + const [startTime, setStartTime] = useState(0); + const [apiKey, setApiKey] = useState(null); + + // Refs + const audioContextRef = useRef(null); + const microphoneRef = useRef(null); + const processorRef = useRef(null); + const streamRef = useRef(null); + const pingIntervalRef = useRef(null); + const scheduledAudioSourcesRef = useRef([]); + const incomingMessage = useRef(null); + + // Config settings + const [configSettings, setConfigSettings] = useState({ + type: "SettingsConfiguration", + audio: { + input: { encoding: "linear32", sample_rate: 48000 }, + output: { encoding: "linear16", sample_rate: 48000, container: "none" }, + }, + agent: { + listen: { model: "nova-2" }, + think: { + provider: { + type: model.split("+")[0], + }, + model: model.split("+")[1], + instructions: + "You are a helpful assistant who responds in 1-2 sentences at most each time.", + }, + speak: { model: voice }, + }, + }); + + // WebSocket setup + const { sendMessage, lastMessage, readyState, getWebSocket } = useWebSocket( + socketURL, + { + // protocols: ["token", DEEPGRAM_API_KEY], + protocols: apiKey ? ["token", apiKey] : undefined, + share: true, + onOpen: () => { + console.log(apiKey); + const socket = getWebSocket(); + if (socket instanceof WebSocket) { + socket.binaryType = "arraybuffer"; + } + setConnection(true); + sendMessage(JSON.stringify(configSettings)); + startPingInterval(); + }, + onError: (error) => { + console.log(apiKey); + console.error("WebSocket error:", error); + stopPingInterval(); + }, + onClose: () => stopPingInterval(), + onMessage: handleWebSocketMessage, + retryOnError: true, + } + ); + + // WebSocket message handler + function handleWebSocketMessage(event: MessageEvent) { + if (typeof event.data === "string") { + const msgObj = JSON.parse(event.data); + const { type: messageType } = msgObj; + + switch (messageType) { + case "UserStartedSpeaking": + setCurrentSpeaker("user"); + clearScheduledAudio(); + incomingMessage.current = null; + break; + case "AgentStartedSpeaking": + setCurrentSpeaker("model"); + break; + case "ConversationText": + if (msgObj.content && msgObj.role === "user") { + setChatMessages((prev) => [ + ...prev, + { ...msgObj, id: Date.now().toString() }, + ]); + } else if (msgObj.content) { + let text = msgObj.content; + if (incomingMessage.current) { + incomingMessage.current = { + ...incomingMessage.current, + content: incomingMessage.current.content + text, + }; + setChatMessages((prev) => { + const updatedMessages = [...prev]; + const index = updatedMessages.findIndex( + (item) => item.id === incomingMessage.current?.id + ); + console.log("index"); + console.log(index); + if (index !== -1) { + updatedMessages[index] = { + ...incomingMessage.current, + } as Message; + } + return updatedMessages; + }); + } else { + incomingMessage.current = { + ...msgObj, + voice, + id: Date.now().toString(), + }; + } + } + break; + case "AgentAudioDone": + const ms = { ...incomingMessage.current }; + if (ms && Object.keys(ms).length) { + setChatMessages((p) => [...p, ms as Message]); + } + setCurrentSpeaker("user-waiting"); + incomingMessage.current = null; + break; 
+
+  const playAudio = useCallback(
+    (audioData: ArrayBuffer) => {
+      if (!audioContextRef.current) return;
+
+      const audioContext = audioContextRef.current;
+      const audioDataView = new Int16Array(audioData);
+
+      if (audioDataView.length === 0) {
+        console.error("Received audio data is empty.");
+        return;
+      }
+
+      const audioBuffer = audioContext.createBuffer(
+        1,
+        audioDataView.length,
+        48000
+      );
+      const audioBufferChannel = audioBuffer.getChannelData(0);
+
+      for (let i = 0; i < audioDataView.length; i++) {
+        audioBufferChannel[i] = audioDataView[i] / 32768; // Convert linear16 PCM to float [-1, 1]
+      }
+
+      const source = audioContext.createBufferSource();
+      source.buffer = audioBuffer;
+      source.connect(audioContext.destination);
+
+      // Start audio playback
+      const currentTime = audioContext.currentTime;
+      if (startTime < currentTime) {
+        setStartTime(currentTime);
+      }
+      source.start(startTime);
+
+      // Update the start time for the next audio
+      setStartTime((prevStartTime) => prevStartTime + audioBuffer.duration);
+      scheduledAudioSourcesRef.current.push(source);
+    },
+    [startTime]
+  );
+
+  const replayAudio = useCallback((audioData: ArrayBuffer) => {
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
+    const audioDataView = new Int16Array(audioData);
+
+    if (audioDataView.length === 0) {
+      console.error("Received audio data is empty.");
+      audioContext.close();
+      return;
+    }
+
+    const audioBuffer = audioContext.createBuffer(
+      1,
+      audioDataView.length,
+      48000
+    );
+    const audioBufferChannel = audioBuffer.getChannelData(0);
+
+    for (let i = 0; i < audioDataView.length; i++) {
+      audioBufferChannel[i] = audioDataView[i] / 32768; // Convert linear16 PCM to float [-1, 1]
+    }
+
+    const source = audioContext.createBufferSource();
+    source.buffer = audioBuffer;
+    source.connect(audioContext.destination);
+
+    source.onended = () => {
+      source.disconnect();
+      audioContext.close().catch((error) => {
+        console.error("Error closing AudioContext:", error);
+      });
+    };
+
+    source.start();
+
+    // Return a function to stop playback if needed
+    return () => {
+      if (source.buffer) {
+        source.stop();
+        source.disconnect();
+      }
+      if (audioContext.state !== "closed") {
+        audioContext.close().catch((error) => {
+          console.error("Error closing AudioContext:", error);
+        });
+      }
+    };
+  }, []);
+
+  const clearScheduledAudio = useCallback(() => {
+    scheduledAudioSourcesRef.current.forEach((source) => {
+      source.stop();
+      source.onended = null;
+    });
+    scheduledAudioSourcesRef.current = [];
+
+    const scheduledAudioMs = Math.round(
+      1000 * (startTime - (audioContextRef.current?.currentTime || 0))
+    );
+    if (scheduledAudioMs > 0) {
+      console.log(`Cleared ${scheduledAudioMs}ms of scheduled audio`);
+    } else {
+      console.log("No scheduled audio to clear.");
+    }
+
+    setStartTime(0);
+  }, [startTime]);
+
+  // Utility functions
+  const startPingInterval = useCallback(() => {
+    pingIntervalRef.current = setInterval(() => {
+      sendMessage(JSON.stringify({ type: "KeepAlive" }));
+    }, PING_INTERVAL);
+  }, [sendMessage]);
+
+  const stopPingInterval = useCallback(() => {
+    if (pingIntervalRef.current) {
+      clearInterval(pingIntervalRef.current);
+      pingIntervalRef.current = null;
+    }
+  }, []);
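Both playAudio and replayAudio above repeat the same int16-to-float decode loop against the 48 kHz mono output configured in the SettingsConfiguration message. If that duplication ever needs factoring out, a shared helper along these lines would do it; this is a sketch, not a function in this patch:

// Hypothetical helper, not in this diff: decode 16-bit linear PCM into a mono AudioBuffer.
function linear16ToAudioBuffer(
  ctx: BaseAudioContext,
  data: ArrayBuffer,
  sampleRate = 48000
): AudioBuffer {
  const samples = new Int16Array(data);
  const buffer = ctx.createBuffer(1, samples.length, sampleRate);
  const channel = buffer.getChannelData(0);
  for (let i = 0; i < samples.length; i++) {
    channel[i] = samples[i] / 32768; // scale signed 16-bit samples to [-1, 1)
  }
  return buffer;
}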
+  // Streaming functions
+  const startStreaming = useCallback(async () => {
+    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
+      console.error("getUserMedia is not supported in this browser.");
+      return;
+    }
+
+    setMicrophoneOpen(true);
+    stopPingInterval();
+
+    const audioContext = new AudioContext();
+    audioContextRef.current = audioContext;
+
+    if (audioContext.state === "suspended") {
+      await audioContext.resume();
+    }
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          sampleRate: 48000,
+          channelCount: 1,
+          echoCancellation: true,
+          autoGainControl: true,
+          noiseSuppression: false,
+        },
+      });
+
+      streamRef.current = stream;
+
+      const microphone = audioContext.createMediaStreamSource(stream);
+      microphoneRef.current = microphone;
+
+      const processor = audioContext.createScriptProcessor(2048, 1, 1);
+      processorRef.current = processor;
+
+      processor.onaudioprocess = (event) => {
+        const inputData = event.inputBuffer.getChannelData(0);
+        sendMessage(inputData.buffer);
+      };
+
+      microphone.connect(processor);
+      processor.connect(audioContext.destination);
+    } catch (err) {
+      console.error("Error accessing microphone:", err);
+      if (audioContextRef.current) {
+        audioContextRef.current.close();
+      }
+      setMicrophoneOpen(false);
+    }
+  }, [sendMessage, stopPingInterval]);
+
+  const stopStreaming = useCallback(() => {
+    if (processorRef.current) {
+      processorRef.current.disconnect();
+      processorRef.current.onaudioprocess = null;
+    }
+    startPingInterval();
+    if (microphoneRef.current) {
+      microphoneRef.current.disconnect();
+    }
+    if (audioContextRef.current) {
+      audioContextRef.current
+        .close()
+        .catch((err) => console.error("Error closing audio context:", err));
+      audioContextRef.current = null;
+    }
+    if (streamRef.current) {
+      streamRef.current.getTracks().forEach((track) => track.stop());
+      streamRef.current = null;
+    }
+    setConnection(false);
+    setCurrentSpeaker(null);
+    setMicrophoneOpen(false);
+  }, [startPingInterval]);
+
+  const updateVoice = useCallback(
+    (newVoice: string) => {
+      stopStreaming();
+      setVoice(newVoice);
+      setCurrentSpeaker(null);
+    },
+    [stopStreaming]
+  );
+
+  const updateModel = useCallback(
+    (newModel: string) => {
+      stopStreaming();
+      setModel(newModel);
+      setCurrentSpeaker(null);
+    },
+    [stopStreaming]
+  );
+
+  // Effects
+
+  // Effect to fetch API key
+  useEffect(() => {
+    if (token) {
+      const fetchApiKey = async () => {
+        try {
+          const key = await getApiKey(token as string);
+          setApiKey(key);
+        } catch (error) {
+          console.error("Failed to fetch API key:", error);
+        }
+      };
+
+      fetchApiKey();
+    }
+  }, [token]);
+
+  // Effect to update socket URL when API key is available
+  useEffect(() => {
+    if (apiKey) {
+      setSocketUrl(`${DEEPGRAM_SOCKET_URL}?t=${Date.now()}`);
+    }
+  }, [apiKey]);
+
+  useEffect(() => {
+    const [provider, modelName] = model.split("+");
+    const newSettings = {
+      ...configSettings,
+      agent: {
+        ...configSettings.agent,
+        think: {
+          ...configSettings.agent.think,
+          provider: {
+            type: provider,
+          },
+          model: modelName,
+        },
+        speak: { model: voice },
+      },
+    };
+
+    if (JSON.stringify(newSettings) !== JSON.stringify(configSettings)) {
+      setConfigSettings(newSettings);
+      setSocketUrl(`${DEEPGRAM_SOCKET_URL}?t=${Date.now()}`);
+    }
+  }, [model, voice, configSettings]);
+
+  useEffect(() => {
+    return () => stopPingInterval();
+  }, [stopPingInterval]);
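The first effect above exchanges the anonymous session JWT for a Deepgram key through getApiKey, which lives in app/lib/helpers.ts and is not shown in this hunk. A plausible shape for it, assuming it calls the token-protected authenticate route added by this patch and reads a `key` field from the JSON response (endpoint path and response shape are assumptions, not confirmed here):

// Hypothetical sketch of getApiKey; the real helper is in app/lib/helpers.ts.
export async function getApiKey(token: string): Promise<string> {
  const response = await fetch("/api/authenticate", {
    headers: { Authorization: `Bearer ${token}` },
    cache: "no-store", // the route busts its cache on every request
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch API key: ${response.status}`);
  }

  const { key } = await response.json();
  return key;
}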
+  // Context value
+  const value = useMemo(
+    () => ({
+      sendMessage,
+      lastMessage,
+      readyState,
+      startStreaming,
+      stopStreaming,
+      connection,
+      voice,
+      model,
+      currentSpeaker,
+      microphoneOpen,
+      chatMessages,
+      setModel: updateModel,
+      setVoice: updateVoice,
+      replayAudio,
+    }),
+    [
+      sendMessage,
+      lastMessage,
+      readyState,
+      startStreaming,
+      stopStreaming,
+      connection,
+      voice,
+      model,
+      currentSpeaker,
+      microphoneOpen,
+      chatMessages,
+      updateModel,
+      updateVoice,
+      replayAudio,
+    ]
+  );
+
+  return (
+    <WebSocketContext.Provider value={value}>
+      {children}
+    </WebSocketContext.Provider>
+  );
+};
+
+// Custom hook
+export const useWebSocketContext = (): WebSocketContextValue => {
+  const context = useContext(WebSocketContext);
+  if (context === undefined) {
+    throw new Error(
+      "useWebSocketContext must be used within a WebSocketProvider"
+    );
+  }
+  return context;
+};
diff --git a/app/layout.tsx b/app/layout.tsx
index 4c4bfb7a..4816d769 100644
--- a/app/layout.tsx
+++ b/app/layout.tsx
@@ -1,20 +1,16 @@
 import { GoogleTagManager } from "@next/third-parties/google";
 import { Inter } from "next/font/google";
-import { NowPlayingContextProvider } from "react-nowplaying";
 import classNames from "classnames";
 import localFont from "next/font/local";
 import Script from "next/script";
-
-import { DeepgramContextProvider } from "./context/Deepgram";
-import { MessageMetadataContextProvider } from "./context/MessageMetadata";
-import { MicrophoneContextProvider } from "./context/Microphone";
-import { AudioStoreContextProvider } from "./context/AudioStore";
 import { ToastContextProvider } from "./context/Toast";
-import 'react-toastify/dist/ReactToastify.css';
+import "react-toastify/dist/ReactToastify.css";
 
 import "./globals.css";
 
 import type { Metadata, Viewport } from "next";
+import { WebSocketProvider } from "./context/WebSocketContext";
+import { AuthContextProvider } from "./context/Auth";
 
 const inter = Inter({ subsets: ["latin"] });
 const favorit = localFont({
@@ -53,15 +49,9 @@ export default function RootLayout({
         )}`}
       >
-        <MicrophoneContextProvider>
-          <AudioStoreContextProvider>
-            <DeepgramContextProvider>
-              <MessageMetadataContextProvider>
-                <NowPlayingContextProvider>{children}</NowPlayingContextProvider>
-              </MessageMetadataContextProvider>
-            </DeepgramContextProvider>
-          </AudioStoreContextProvider>
-        </MicrophoneContextProvider>
+        <AuthContextProvider>
+          <WebSocketProvider>{children}</WebSocketProvider>
+        </AuthContextProvider>
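For reference, a client component consuming the new context might look like the following. This is an illustrative sketch, not a file from this patch; the import path and component name are assumptions, while the context fields match the value exported above:

// Illustrative only — shows how a component can drive the agent connection
// through useWebSocketContext.
"use client";

import { useWebSocketContext } from "@/app/context/WebSocketContext";

export function MicToggle() {
  const { microphoneOpen, startStreaming, stopStreaming, chatMessages } =
    useWebSocketContext();

  return (
    <div>
      <button
        onClick={() => (microphoneOpen ? stopStreaming() : startStreaming())}
      >
        {microphoneOpen ? "Stop microphone" : "Start microphone"}
      </button>
      <p>{chatMessages.length} messages so far</p>
    </div>
  );
}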