From 8882619742b8596cc016be084a51049fd1687f8f Mon Sep 17 00:00:00 2001 From: Willy Douhard Date: Fri, 4 Oct 2024 18:22:53 +0200 Subject: [PATCH] Willy/realtime (#1401) * Bump literalai depdendency. * Update literalai imports * Remove unused imports from SQLAlchemy tests. * Unit tests for LiteralDataLayer. * Consistent LiteralAI to Chainlit conversion, resolve PaginatedResponse exceptions. - Create LiteralToChainlitConverter class for handling conversions - Implement methods for converting steps, threads, and attachments - Add support for different Element subclasses based on metadata - Allow manual setting of thread_id and id for Step and Element * Attempt to satisfy mypy (plus cleaner approach). * feat: add realtime audio * fix: default config * fix: lint --------- Co-authored-by: Mathijs de Bruin Co-authored-by: EWouters <6179932+EWouters@users.noreply.github.com> --- backend/chainlit/__init__.py | 7 +- backend/chainlit/callbacks.py | 17 +- backend/chainlit/config.py | 27 +- backend/chainlit/emitter.py | 25 + backend/chainlit/socket.py | 36 +- backend/chainlit/translations/en-US.json | 3 +- backend/chainlit/types.py | 12 +- backend/pyproject.toml | 2 +- frontend/src/assets/microphone.tsx | 2 +- frontend/src/assets/microphoneOff.tsx | 26 + .../molecules/messages/Messages.tsx | 2 + .../organisms/chat/inputBox/AudioPresence.tsx | 123 ++++ .../chat/inputBox/MicButton/RecordScreen.tsx | 58 -- .../chat/inputBox/MicButton/index.tsx | 98 ++-- .../organisms/chat/inputBox/footer.tsx | 26 + .../organisms/chat/inputBox/index.tsx | 16 +- .../organisms/chat/inputBox/waterMark.tsx | 43 +- frontend/src/components/organisms/header.tsx | 18 +- frontend/src/pages/ResumeButton.tsx | 4 +- libs/copilot/src/components/Input.tsx | 4 +- libs/react-client/src/index.ts | 2 + libs/react-client/src/state.ts | 23 + libs/react-client/src/types/audio.ts | 5 + libs/react-client/src/types/config.ts | 6 +- libs/react-client/src/useAudio.ts | 218 +------ libs/react-client/src/useChatInteract.ts | 21 +- libs/react-client/src/useChatSession.ts | 48 +- .../src/wavtools/analysis/audio_analysis.js | 203 +++++++ .../src/wavtools/analysis/constants.js | 60 ++ libs/react-client/src/wavtools/index.ts | 7 + libs/react-client/src/wavtools/wav_packer.js | 113 ++++ .../react-client/src/wavtools/wav_recorder.js | 548 ++++++++++++++++++ .../react-client/src/wavtools/wav_renderer.ts | 132 +++++ .../src/wavtools/wav_stream_player.js | 162 ++++++ .../src/wavtools/worklets/audio_processor.js | 214 +++++++ .../src/wavtools/worklets/stream_processor.js | 96 +++ 36 files changed, 2007 insertions(+), 400 deletions(-) create mode 100644 frontend/src/assets/microphoneOff.tsx create mode 100644 frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx delete mode 100644 frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx create mode 100644 frontend/src/components/organisms/chat/inputBox/footer.tsx create mode 100644 libs/react-client/src/types/audio.ts create mode 100644 libs/react-client/src/wavtools/analysis/audio_analysis.js create mode 100644 libs/react-client/src/wavtools/analysis/constants.js create mode 100644 libs/react-client/src/wavtools/index.ts create mode 100644 libs/react-client/src/wavtools/wav_packer.js create mode 100644 libs/react-client/src/wavtools/wav_recorder.js create mode 100644 libs/react-client/src/wavtools/wav_renderer.ts create mode 100644 libs/react-client/src/wavtools/wav_stream_player.js create mode 100644 libs/react-client/src/wavtools/worklets/audio_processor.js create mode 100644 
libs/react-client/src/wavtools/worklets/stream_processor.js diff --git a/backend/chainlit/__init__.py b/backend/chainlit/__init__.py index 0506ef38f3..5455fac760 100644 --- a/backend/chainlit/__init__.py +++ b/backend/chainlit/__init__.py @@ -43,7 +43,7 @@ ) from chainlit.step import Step, step from chainlit.sync import make_async, run_sync -from chainlit.types import AudioChunk, ChatProfile, Starter +from chainlit.types import InputAudioChunk, OutputAudioChunk, ChatProfile, Starter from chainlit.user import PersistedUser, User from chainlit.user_session import user_session from chainlit.utils import make_module_getattr @@ -56,6 +56,7 @@ author_rename, header_auth_callback, oauth_callback, + on_audio_start, on_audio_chunk, on_audio_end, on_chat_end, @@ -117,7 +118,8 @@ def acall(self): "user_session", "chat_context", "CopilotFunction", - "AudioChunk", + "InputAudioChunk", + "OutputAudioChunk", "Action", "User", "PersistedUser", @@ -176,6 +178,7 @@ def acall(self): "set_chat_profiles", "set_starters", "on_chat_end", + "on_audio_start", "on_audio_chunk", "on_audio_end", "author_rename", diff --git a/backend/chainlit/callbacks.py b/backend/chainlit/callbacks.py index b559049d7b..02c6feb124 100644 --- a/backend/chainlit/callbacks.py +++ b/backend/chainlit/callbacks.py @@ -209,13 +209,25 @@ def on_chat_end(func: Callable) -> Callable: return func +@trace +def on_audio_start(func: Callable) -> Callable: + """ + Hook to react to the user initiating audio. + + Returns: + Callable[], Any]: The decorated hook. + """ + + config.code.on_audio_start = wrap_user_function(func, with_task=False) + return func + @trace def on_audio_chunk(func: Callable) -> Callable: """ Hook to react to the audio chunks being sent. Args: - chunk (AudioChunk): The audio chunk being sent. + chunk (InputAudioChunk): The audio chunk being sent. Returns: Callable[], Any]: The decorated hook. @@ -230,9 +242,6 @@ def on_audio_end(func: Callable) -> Callable: """ Hook to react to the audio stream ending. This is called after the last audio chunk is sent. - Args: - elements ([List[Element]): The files that were uploaded before starting the audio stream (if any). - Returns: Callable[], Any]: The decorated hook. """ diff --git a/backend/chainlit/config.py b/backend/chainlit/config.py index 5700479677..60bc764b47 100644 --- a/backend/chainlit/config.py +++ b/backend/chainlit/config.py @@ -28,9 +28,8 @@ if TYPE_CHECKING: from chainlit.action import Action - from chainlit.element import ElementBased from chainlit.message import Message - from chainlit.types import AudioChunk, ChatProfile, Starter, ThreadDict + from chainlit.types import InputAudioChunk, ChatProfile, Starter, ThreadDict from chainlit.user import User from fastapi import Request, Response @@ -93,18 +92,8 @@ max_size_mb = 500 [features.audio] - # Threshold for audio recording - min_decibels = -45 - # Delay for the user to start speaking in MS - initial_silence_timeout = 3000 - # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop. - silence_timeout = 1500 - # Above this duration (MS), the recording will forcefully stop. - max_duration = 15000 - # Duration of the audio chunks in MS - chunk_duration = 1000 # Sample rate of the audio - sample_rate = 44100 + sample_rate = 24000 [UI] # Name of the assistant. 
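Editor's note: the hunks above replace the old client-side silence-detection settings with a single sample_rate (default 24000, PCM16) and add the on_audio_start / on_audio_chunk / on_audio_end hooks plus the InputAudioChunk / OutputAudioChunk types. A minimal usage sketch of how these pieces are meant to fit together, assuming the realtime processing itself (forwarding chunks to a model, producing reply audio) is supplied by the application and is not part of this patch:

import chainlit as cl

@cl.on_audio_start
async def on_audio_start():
    # Truthiness of the return value decides whether the client connection
    # is switched "on" or kept "off" (see the audio_start socket handler below).
    return True

@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.InputAudioChunk):
    # chunk.data is raw PCM16 bytes captured at features.audio.sample_rate;
    # chunk.isStart marks the first chunk of a stream. Forwarding the bytes
    # to a realtime backend is application-specific (not defined in this patch).
    ...

@cl.on_audio_end
async def on_audio_end():
    # Called once after the client closes the audio stream; release any
    # realtime session held by the application here.
    ...

# Reply audio can be streamed back through the emitter methods added in this
# patch, along the lines of (track name and payload are illustrative):
#   await cl.context.emitter.send_audio_chunk(
#       cl.OutputAudioChunk(track="response", mimeType="pcm16", data=pcm_bytes)
#   )
#   await cl.context.emitter.send_audio_interrupt()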
@@ -237,12 +226,7 @@ class SpontaneousFileUploadFeature(DataClassJsonMixin): @dataclass class AudioFeature(DataClassJsonMixin): - min_decibels: int = -45 - initial_silence_timeout: int = 2000 - silence_timeout: int = 1500 - chunk_duration: int = 1000 - max_duration: int = 15000 - sample_rate: int = 44100 + sample_rate: int = 24000 enabled: bool = False @@ -297,8 +281,9 @@ class CodeSettings: on_chat_end: Optional[Callable[[], Any]] = None on_chat_resume: Optional[Callable[["ThreadDict"], Any]] = None on_message: Optional[Callable[["Message"], Any]] = None - on_audio_chunk: Optional[Callable[["AudioChunk"], Any]] = None - on_audio_end: Optional[Callable[[List["ElementBased"]], Any]] = None + on_audio_start: Optional[Callable[[], Any]] = None + on_audio_chunk: Optional[Callable[["InputAudioChunk"], Any]] = None + on_audio_end: Optional[Callable[[], Any]] = None author_rename: Optional[Callable[[str], Awaitable[str]]] = None on_settings_update: Optional[Callable[[Dict[str, Any]], Any]] = None diff --git a/backend/chainlit/emitter.py b/backend/chainlit/emitter.py index e80bd46dd2..df8a78e9f4 100644 --- a/backend/chainlit/emitter.py +++ b/backend/chainlit/emitter.py @@ -17,6 +17,7 @@ FileReference, MessagePayload, ThreadDict, + OutputAudioChunk ) from chainlit.user import PersistedUser from literalai.helper import utc_now @@ -51,6 +52,18 @@ async def resume_thread(self, thread_dict: ThreadDict): async def send_element(self, element_dict: ElementDict): """Stub method to send an element to the UI.""" pass + + async def update_audio_connection(self, state: Literal["on", "off"]): + """Audio connection signaling.""" + pass + + async def send_audio_chunk(self, chunk: OutputAudioChunk): + """Stub method to send an audio chunk to the UI.""" + pass + + async def send_audio_interrupt(self): + """Stub method to interrupt the current audio response.""" + pass async def send_step(self, step_dict: StepDict): """Stub method to send a message to the UI.""" @@ -157,6 +170,18 @@ def resume_thread(self, thread_dict: ThreadDict): """Send a thread to the UI to resume it""" return self.emit("resume_thread", thread_dict) + async def update_audio_connection(self, state: Literal["on", "off"]): + """Audio connection signaling.""" + await self.emit("audio_connection", state) + + async def send_audio_chunk(self, chunk: OutputAudioChunk): + """Send an audio chunk to the UI.""" + await self.emit("audio_chunk", chunk) + + async def send_audio_interrupt(self): + """Method to interrupt the current audio response.""" + await self.emit("audio_interrupt", {}) + async def send_element(self, element_dict: ElementDict): """Stub method to send an element to the UI.""" await self.emit("element", element_dict) diff --git a/backend/chainlit/socket.py b/backend/chainlit/socket.py index 952fd38ae9..4cfc42fa9a 100644 --- a/backend/chainlit/socket.py +++ b/backend/chainlit/socket.py @@ -18,9 +18,8 @@ from chainlit.session import WebsocketSession from chainlit.telemetry import trace_event from chainlit.types import ( - AudioChunk, - AudioChunkPayload, - AudioEndPayload, + InputAudioChunk, + InputAudioChunkPayload, MessagePayload, ) from chainlit.user_session import user_sessions @@ -314,19 +313,31 @@ async def message(sid, payload: MessagePayload): session.current_task = task +@sio.on("audio_start") +async def audio_start(sid): + """Handle audio init.""" + session = WebsocketSession.require(sid) + + context = init_ws_context(session) + if config.code.on_audio_start: + connected = bool(await config.code.on_audio_start()) + connection_state = 
"on" if connected else "off" + await context.emitter.update_audio_connection(connection_state) + + @sio.on("audio_chunk") -async def audio_chunk(sid, payload: AudioChunkPayload): +async def audio_chunk(sid, payload: InputAudioChunkPayload): """Handle an audio chunk sent by the user.""" session = WebsocketSession.require(sid) init_ws_context(session) if config.code.on_audio_chunk: - asyncio.create_task(config.code.on_audio_chunk(AudioChunk(**payload))) + asyncio.create_task(config.code.on_audio_chunk(InputAudioChunk(**payload))) @sio.on("audio_end") -async def audio_end(sid, payload: AudioEndPayload): +async def audio_end(sid): """Handle the end of the audio stream.""" session = WebsocketSession.require(sid) try: @@ -337,18 +348,9 @@ async def audio_end(sid, payload: AudioEndPayload): session.has_first_interaction = True asyncio.create_task(context.emitter.init_thread("audio")) - file_elements = [] if config.code.on_audio_end: - file_refs = payload.get("fileReferences") - if file_refs: - files = [ - session.files[file["id"]] - for file in file_refs - if file["id"] in session.files - ] - file_elements = [Element.from_dict(file) for file in files] - - await config.code.on_audio_end(file_elements) + await config.code.on_audio_end() + except asyncio.CancelledError: pass except Exception as e: diff --git a/backend/chainlit/translations/en-US.json b/backend/chainlit/translations/en-US.json index 2bb9b6f4d8..f8a7fd096e 100644 --- a/backend/chainlit/translations/en-US.json +++ b/backend/chainlit/translations/en-US.json @@ -124,7 +124,8 @@ }, "speechButton": { "start": "Start recording", - "stop": "Stop recording" + "stop": "Stop recording", + "loading": "Connecting" }, "SubmitButton": { "sendMessage": "Send message", diff --git a/backend/chainlit/types.py b/backend/chainlit/types.py index 1ca9d18bb6..b094716f93 100644 --- a/backend/chainlit/types.py +++ b/backend/chainlit/types.py @@ -154,7 +154,7 @@ class MessagePayload(TypedDict): fileReferences: Optional[List[FileReference]] -class AudioChunkPayload(TypedDict): +class InputAudioChunkPayload(TypedDict): isStart: bool mimeType: str elapsedTime: float @@ -162,16 +162,16 @@ class AudioChunkPayload(TypedDict): @dataclass -class AudioChunk: +class InputAudioChunk: isStart: bool mimeType: str elapsedTime: float data: bytes - -class AudioEndPayload(TypedDict): - fileReferences: Optional[List[FileReference]] - +class OutputAudioChunk(TypedDict): + track: str + mimeType: str + data: bytes @dataclass class AskFileResponse: diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 503bd6ed43..c8750e9956 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "chainlit" -version = "1.3.0rc0" +version = "1.3.0rc1" keywords = [ 'LLM', 'Agents', diff --git a/frontend/src/assets/microphone.tsx b/frontend/src/assets/microphone.tsx index 226b649e07..04df14e711 100644 --- a/frontend/src/assets/microphone.tsx +++ b/frontend/src/assets/microphone.tsx @@ -15,7 +15,7 @@ const MicrophoneIcon = (props: SvgIconProps) => { > - {' '} + ); }; diff --git a/frontend/src/assets/microphoneOff.tsx b/frontend/src/assets/microphoneOff.tsx new file mode 100644 index 0000000000..10141bab3f --- /dev/null +++ b/frontend/src/assets/microphoneOff.tsx @@ -0,0 +1,26 @@ +import SvgIcon, { SvgIconProps } from '@mui/material/SvgIcon'; + +const MicrophoneOffIcon = (props: SvgIconProps) => { + return ( + + + + + + + + + ); +}; + +export default MicrophoneOffIcon; diff --git a/frontend/src/components/molecules/messages/Messages.tsx 
b/frontend/src/components/molecules/messages/Messages.tsx index 024874299d..2c3063d443 100644 --- a/frontend/src/components/molecules/messages/Messages.tsx +++ b/frontend/src/components/molecules/messages/Messages.tsx @@ -62,6 +62,7 @@ const Messages = memo( <> {m.steps?.length ? ( ) : null} diff --git a/frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx b/frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx new file mode 100644 index 0000000000..44ad6ae5f9 --- /dev/null +++ b/frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx @@ -0,0 +1,123 @@ +import { useEffect, useRef } from 'react'; + +import { Box, Typography } from '@mui/material'; +import { useTheme } from '@mui/material/styles'; + +import { WavRenderer, useAudio } from '@chainlit/react-client'; + +interface Props { + type: 'client' | 'server'; + height: number; + width: number; + barCount: number; + barSpacing: number; +} + +export default function AudioPresence({ + type, + height, + width, + barCount, + barSpacing +}: Props) { + const theme = useTheme(); + const { wavRecorder, wavStreamPlayer, isAiSpeaking } = useAudio(); + const canvasRef = useRef(null); + + width = type === 'server' && !isAiSpeaking ? height : width; + + useEffect(() => { + let isLoaded = true; + const dpr = window.devicePixelRatio || 1; + let bounceDirection = 1; + let bounceFactor = 0; + + const getData = () => { + if (type === 'server' && isAiSpeaking) { + return wavStreamPlayer.analyser + ? wavStreamPlayer.getFrequencies('voice') + : { values: new Float32Array([0]) }; + } else { + return wavRecorder.recording + ? wavRecorder.getFrequencies('voice') + : { values: new Float32Array([0]) }; + } + }; + + const render = () => { + if (!isLoaded) return; + const canvas = canvasRef.current; + let ctx: CanvasRenderingContext2D | null = null; + + if (canvas) { + // Set the canvas size based on the DPR + canvas.width = width * dpr; + canvas.height = height * dpr; + canvas.style.width = `${width}px`; + canvas.style.height = `${height}px`; + + ctx = ctx || canvas.getContext('2d'); + if (ctx) { + // Scale the context to account for the DPR + ctx.scale(dpr, dpr); + + ctx.clearRect(0, 0, width, height); // Use CSS dimensions here + const result = getData(); + + if (type === 'server' && !isAiSpeaking) { + // Draw a bouncing circle + const amplitude = Math.min( + Math.max(0.6, Math.max(...result.values)), + 1 + ); // Ensure a minimum amplitude + const maxRadius = width / 2; + const baseRadius = maxRadius * amplitude; + const radius = baseRadius * (0.6 + 0.2 * bounceFactor); + const centerX = width / 2; + const centerY = height / 2; + + ctx.fillStyle = theme.palette.text.primary; + ctx.beginPath(); + ctx.arc(centerX, centerY, radius, 0, Math.PI * 2); + ctx.fill(); + + const newFactor = bounceFactor + 0.01 * bounceDirection; + if (newFactor > 1 || newFactor < 0) { + bounceDirection *= -1; + } + bounceFactor = Math.max(0, Math.min(newFactor, 1)); + } else { + WavRenderer.drawBars( + ctx, + result.values, + width, + height, + theme.palette.text.primary, + barCount, + 0, + barSpacing, + true + ); + } + } + } + window.requestAnimationFrame(render); + }; + render(); + + return () => { + isLoaded = false; + }; + }, [height, width, barCount, barSpacing, theme, wavRecorder, isAiSpeaking]); + + return ( + + {type === 'server' && !isAiSpeaking ? 
( + + Listening + + ) : null} + + + ); +} diff --git a/frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx b/frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx deleted file mode 100644 index fb25d2a030..0000000000 --- a/frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx +++ /dev/null @@ -1,58 +0,0 @@ -import { grey } from 'theme/palette'; - -import { Box } from '@mui/material'; -import Backdrop from '@mui/material/Backdrop'; - -import MicrophoneIcon from 'assets/microphone'; - -interface Props { - open?: boolean; - isSpeaking?: boolean; -} - -export default function RecordScreen({ open, isSpeaking }: Props) { - return ( - theme.zIndex.drawer + 1 }} - open={!!open} - > - - - - - - - - - - - ); -} diff --git a/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx b/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx index 8be273e7dc..6567b95e47 100644 --- a/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx +++ b/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx @@ -1,85 +1,87 @@ -import { useCallback, useEffect, useMemo } from 'react'; import { useHotkeys } from 'react-hotkeys-hook'; -import { useRecoilState, useRecoilValue } from 'recoil'; -import { toast } from 'sonner'; -import { IconButton, Theme, Tooltip, useMediaQuery } from '@mui/material'; +import { + CircularProgress, + IconButton, + Theme, + Tooltip, + useMediaQuery +} from '@mui/material'; -import { askUserState, useAudio, useConfig } from '@chainlit/react-client'; +import { useAudio, useConfig } from '@chainlit/react-client'; import { Translator } from 'components/i18n'; import MicrophoneIcon from 'assets/microphone'; - -import { attachmentsState } from 'state/chat'; - -import RecordScreen from './RecordScreen'; +import MicrophoneOffIcon from 'assets/microphoneOff'; interface Props { disabled?: boolean; } const MicButton = ({ disabled }: Props) => { - const askUser = useRecoilValue(askUserState); const { config } = useConfig(); - const { - startRecording: _startRecording, - isRecording, - isSpeaking, - isRecordingFinished, - error - } = useAudio(config?.features.audio); - const [attachments, setAttachments] = useRecoilState(attachmentsState); - - disabled = disabled || !!askUser; - - useEffect(() => { - if (isRecordingFinished) setAttachments([]); - }, [isRecordingFinished]); - - useEffect(() => { - if (!error) return; - toast.error(error); - }, [error]); - - const fileReferences = useMemo(() => { - return attachments - ?.filter((a) => !!a.serverId) - .map((a) => ({ id: a.serverId! })); - }, [attachments]); - - const startRecording = useCallback(() => { - if (disabled) return; - _startRecording(fileReferences); - }, [_startRecording, fileReferences, disabled]); - - useHotkeys('p', startRecording); + const { startConversation, endConversation, audioConnection } = useAudio(); + const isEnabled = !!config?.features.audio.enabled; + + useHotkeys( + 'p', + () => { + if (!isEnabled) return; + if (audioConnection === 'on') return endConversation(); + return startConversation(); + }, + [isEnabled, audioConnection, startConversation, endConversation] + ); const size = useMediaQuery((theme) => theme.breakpoints.down('sm')) ? 'small' : 'medium'; - if (!config?.features.audio.enabled) return null; + if (!isEnabled) return null; return ( <> - } > - + {audioConnection === 'on' ? ( + + ) : null} + {audioConnection === 'off' ? ( + + ) : null} + {audioConnection === 'connecting' ? 
( + + ) : null} diff --git a/frontend/src/components/organisms/chat/inputBox/footer.tsx b/frontend/src/components/organisms/chat/inputBox/footer.tsx new file mode 100644 index 0000000000..782af94e6d --- /dev/null +++ b/frontend/src/components/organisms/chat/inputBox/footer.tsx @@ -0,0 +1,26 @@ +import { Stack } from '@mui/material'; + +import { useAudio } from '@chainlit/react-client'; + +import AudioPresence from 'components/organisms/chat/inputBox/AudioPresence'; +import WaterMark from 'components/organisms/chat/inputBox/waterMark'; + +export default function InputBoxFooter() { + const { audioConnection } = useAudio(); + + return ( + + {audioConnection === 'on' ? ( + + ) : ( + + )} + + ); +} diff --git a/frontend/src/components/organisms/chat/inputBox/index.tsx b/frontend/src/components/organisms/chat/inputBox/index.tsx index 66ae63a405..acb4274ed0 100644 --- a/frontend/src/components/organisms/chat/inputBox/index.tsx +++ b/frontend/src/components/organisms/chat/inputBox/index.tsx @@ -14,8 +14,8 @@ import { useLayoutMaxWidth } from 'hooks/useLayoutMaxWidth'; import { IAttachment } from 'state/chat'; import { inputHistoryState } from 'state/userInputHistory'; +import InputBoxFooter from './footer'; import Input from './input'; -import WaterMark from './waterMark'; interface Props { fileSpec: FileSpec; @@ -38,7 +38,6 @@ const InputBox = memo( const { user } = useAuth(); const { sendMessage, replyMessage } = useChatInteract(); - // const tokenCount = useRecoilValue(tokenCountState); const onSubmit = useCallback( async (msg: string, attachments?: IAttachment[]) => { @@ -122,19 +121,8 @@ const InputBox = memo( onSubmit={onSubmit} onReply={onReply} /> - {/* {tokenCount > 0 && ( */} - {/* - - Token usage: {tokenCount} - - */} - {/* )} */} - + ); } diff --git a/frontend/src/components/organisms/chat/inputBox/waterMark.tsx b/frontend/src/components/organisms/chat/inputBox/waterMark.tsx index d1436e3697..9db6d0a908 100644 --- a/frontend/src/components/organisms/chat/inputBox/waterMark.tsx +++ b/frontend/src/components/organisms/chat/inputBox/waterMark.tsx @@ -1,6 +1,6 @@ import { useRecoilValue } from 'recoil'; -import { Stack, Typography } from '@mui/material'; +import { Typography } from '@mui/material'; import { Translator } from 'components/i18n'; @@ -14,29 +14,28 @@ import { settingsState } from 'state/settings'; export default function WaterMark() { const { theme } = useRecoilValue(settingsState); const Logo = theme === 'light' ? 
LogoLight : LogoDark; + return ( - - + + + + - - - - - - + /> + ); } diff --git a/frontend/src/components/organisms/header.tsx b/frontend/src/components/organisms/header.tsx index d75a4d7bc5..ce5a2242a7 100644 --- a/frontend/src/components/organisms/header.tsx +++ b/frontend/src/components/organisms/header.tsx @@ -4,6 +4,8 @@ import { useRecoilValue } from 'recoil'; import { Box, Stack } from '@mui/material'; import useMediaQuery from '@mui/material/useMediaQuery'; +import { useAudio } from '@chainlit/react-client'; + import UserButton from 'components/atoms/buttons/userButton'; import { Logo } from 'components/atoms/logo'; import ChatProfiles from 'components/molecules/chatProfiles'; @@ -11,10 +13,12 @@ import NewChatButton from 'components/molecules/newChatButton'; import { settingsState } from 'state/settings'; +import AudioPresence from './chat/inputBox/AudioPresence'; import { OpenSideBarMobileButton } from './sidebar/OpenSideBarMobileButton'; const Header = memo(() => { const isMobile = useMediaQuery('(max-width: 66rem)'); + const { audioConnection } = useAudio(); const { isChatHistoryOpen } = useRecoilValue(settingsState); return ( @@ -36,9 +40,21 @@ const Header = memo(() => { position: 'absolute', top: '50%', left: '50%', - transform: 'translate(-50%, -50%)' + transform: 'translate(-50%, -50%)', + display: 'flex', + alignItems: 'center', + gap: 1 }} > + {audioConnection === 'on' ? ( + + ) : null} {isMobile ? ( diff --git a/frontend/src/pages/ResumeButton.tsx b/frontend/src/pages/ResumeButton.tsx index 4d3931d6c2..c5c5418514 100644 --- a/frontend/src/pages/ResumeButton.tsx +++ b/frontend/src/pages/ResumeButton.tsx @@ -11,7 +11,7 @@ import { } from '@chainlit/react-client'; import { Translator } from 'components/i18n'; -import WaterMark from 'components/organisms/chat/inputBox/waterMark'; +import InputBoxFooter from 'components/organisms/chat/inputBox/footer'; import { useLayoutMaxWidth } from 'hooks/useLayoutMaxWidth'; @@ -66,7 +66,7 @@ export default function ResumeButton({ threadId }: Props) { - + ); } diff --git a/libs/copilot/src/components/Input.tsx b/libs/copilot/src/components/Input.tsx index bcb5a5a070..96b736f09b 100644 --- a/libs/copilot/src/components/Input.tsx +++ b/libs/copilot/src/components/Input.tsx @@ -10,7 +10,7 @@ import { Attachments } from '@chainlit/app/src/components/molecules/attachments' import MicButton from '@chainlit/app/src/components/organisms/chat/inputBox/MicButton'; import { SubmitButton } from '@chainlit/app/src/components/organisms/chat/inputBox/SubmitButton'; import UploadButton from '@chainlit/app/src/components/organisms/chat/inputBox/UploadButton'; -import WaterMark from '@chainlit/app/src/components/organisms/chat/inputBox/waterMark'; +import InputBoxFooter from '@chainlit/app/src/components/organisms/chat/inputBox/footer'; import { IAttachment, attachmentsState } from '@chainlit/app/src/state/chat'; import { chatSettingsOpenState } from '@chainlit/app/src/state/project'; import { inputHistoryState } from '@chainlit/app/src/state/userInputHistory'; @@ -212,7 +212,7 @@ const Input = memo( - + diff --git a/libs/react-client/src/index.ts b/libs/react-client/src/index.ts index 42599509a4..083d74abe4 100644 --- a/libs/react-client/src/index.ts +++ b/libs/react-client/src/index.ts @@ -11,3 +11,5 @@ export * from './state'; export * from './utils/message'; export { Socket } from 'socket.io-client'; + +export { WavRenderer } from './wavtools/wav_renderer'; diff --git a/libs/react-client/src/state.ts b/libs/react-client/src/state.ts index 
54ca14973c..e514daee27 100644 --- a/libs/react-client/src/state.ts +++ b/libs/react-client/src/state.ts @@ -16,6 +16,7 @@ import { ThreadHistory } from './types'; import { groupByDate } from './utils/group'; +import { WavRecorder, WavStreamPlayer } from './wavtools'; export interface ISession { socket: Socket; @@ -76,6 +77,28 @@ export const askUserState = atom({ default: undefined }); +export const wavRecorderState = atom({ + key: 'WavRecorder', + dangerouslyAllowMutability: true, + default: new WavRecorder() +}); + +export const wavStreamPlayerState = atom({ + key: 'WavStreamPlayer', + dangerouslyAllowMutability: true, + default: new WavStreamPlayer() +}); + +export const audioConnectionState = atom<'connecting' | 'on' | 'off'>({ + key: 'AudioConnection', + default: 'off' +}); + +export const isAiSpeakingState = atom({ + key: 'isAiSpeaking', + default: false +}); + export const callFnState = atom({ key: 'CallFn', default: undefined diff --git a/libs/react-client/src/types/audio.ts b/libs/react-client/src/types/audio.ts new file mode 100644 index 0000000000..e53518084e --- /dev/null +++ b/libs/react-client/src/types/audio.ts @@ -0,0 +1,5 @@ +export interface OutputAudioChunk { + track: string; + mimeType: string; + data: Int16Array; +} diff --git a/libs/react-client/src/types/config.ts b/libs/react-client/src/types/config.ts index 3ae19354d9..0184e35c4e 100644 --- a/libs/react-client/src/types/config.ts +++ b/libs/react-client/src/types/config.ts @@ -14,11 +14,7 @@ export interface ChatProfile { export interface IAudioConfig { enabled: boolean; - min_decibels: number; - initial_silence_timeout: number; - silence_timeout: number; - chunk_duration: number; - max_duration: number; + sample_rate: number; } export interface IAuthConfig { diff --git a/libs/react-client/src/useAudio.ts b/libs/react-client/src/useAudio.ts index c777a09ec6..79a964f3f2 100644 --- a/libs/react-client/src/useAudio.ts +++ b/libs/react-client/src/useAudio.ts @@ -1,196 +1,42 @@ -import { useCallback, useRef, useState } from 'react'; - -import { IAudioConfig, IFileRef } from './types'; +import { useCallback } from 'react'; +import { useRecoilState, useRecoilValue } from 'recoil'; + +import { + audioConnectionState, + isAiSpeakingState, + wavRecorderState, + wavStreamPlayerState +} from './state'; import { useChatInteract } from './useChatInteract'; -const defaultConfig: IAudioConfig = { - enabled: true, - min_decibels: -45, - initial_silence_timeout: 3000, - silence_timeout: 1500, - max_duration: 15000, - chunk_duration: 1000 -}; - -const useAudio = (config = defaultConfig) => { - const mediaRecorderRef = useRef(null); - const cancelling = useRef(false); - const { sendAudioChunk, endAudioStream } = useChatInteract(); - const [isRecording, setIsRecording] = useState(false); - const [timer, setTimer] = useState(undefined); - const [isSpeaking, setIsSpeaking] = useState(false); - const [error, setError] = useState(undefined); - const [isRecordingFinished, setIsRecordingFinished] = useState(false); - - const cancelRecording = useCallback(() => { - if (!isRecording || !mediaRecorderRef.current) { - return; - } - cancelling.current = true; - mediaRecorderRef.current.stop(); - }, [isRecording]); - - const stopRecording = useCallback(() => { - if (!isRecording || !mediaRecorderRef.current) { - return; - } - mediaRecorderRef.current.stop(); - }, [isRecording]); - - const startRecording = useCallback( - (fileReferences?: IFileRef[]) => { - if (isRecording || !config) { - return; - } - setIsRecordingFinished(false); - 
setError(undefined); - clearTimeout(timer); - cancelling.current = false; - - const { - min_decibels, - silence_timeout, - initial_silence_timeout, - chunk_duration, - max_duration - } = config; - - navigator.mediaDevices - .getUserMedia({ audio: true }) - .then((stream) => { - let spokeAtLeastOnce = false; - let isSpeaking = false; - let isFirstChunk = true; - let audioBuffer: Blob | null = null; - let startTime = Date.now(); - - const mediaRecorder = new MediaRecorder(stream); - mediaRecorderRef.current = mediaRecorder; - mediaRecorder.addEventListener('start', () => { - setIsRecording(true); - startTime = Date.now(); - }); - - mediaRecorder.addEventListener('dataavailable', async (event) => { - if (!spokeAtLeastOnce) { - if (!audioBuffer) { - audioBuffer = new Blob([event.data], { type: event.data.type }); - } else { - audioBuffer = new Blob([audioBuffer, event.data], { - type: event.data.type - }); - } - } - if (mediaRecorder.state === 'inactive') { - return; - } - const elapsedTime = Date.now() - startTime; - if (elapsedTime >= max_duration) { - mediaRecorder.stop(); - stream.getTracks().forEach((track) => track.stop()); - return; - } - - setIsSpeaking(isSpeaking); - const [mimeType, _] = mediaRecorder.mimeType.split(';'); - - if (audioBuffer) { - // If there is buffered data and the user has spoken, send the buffered data first - await sendAudioChunk( - isFirstChunk, - mimeType, - elapsedTime, - new Blob([audioBuffer, event.data]) - ); - audioBuffer = null; // Clear the buffer - } else { - await sendAudioChunk( - isFirstChunk, - mimeType, - elapsedTime, - event.data - ); - } - - if (isFirstChunk) { - isFirstChunk = false; - } - }); - - mediaRecorder.addEventListener('stop', async () => { - setIsRecording(false); - setIsSpeaking(false); - - if (spokeAtLeastOnce && !cancelling.current) { - setIsRecordingFinished(true); - await endAudioStream(fileReferences); - } - }); - - const audioContext = new AudioContext(); - const audioStreamSource = - audioContext.createMediaStreamSource(stream); - const analyser = audioContext.createAnalyser(); - analyser.minDecibels = min_decibels; - audioStreamSource.connect(analyser); - - const bufferLength = analyser.frequencyBinCount; - - const domainData = new Uint8Array(bufferLength); - - mediaRecorder.start(chunk_duration); +const useAudio = () => { + const [audioConnection, setAudioConnection] = + useRecoilState(audioConnectionState); + const wavRecorder = useRecoilValue(wavRecorderState); + const wavStreamPlayer = useRecoilValue(wavStreamPlayerState); + const isAiSpeaking = useRecoilValue(isAiSpeakingState); - const detectSound = () => { - if (mediaRecorder.state === 'inactive') { - return; - } - analyser.getByteFrequencyData(domainData); - const soundDetected = domainData.some((value) => value > 0); + const { startAudioStream, endAudioStream } = useChatInteract(); - if (!isSpeaking) { - isSpeaking = soundDetected; - } - if (!spokeAtLeastOnce && soundDetected) { - setIsSpeaking(isSpeaking); - spokeAtLeastOnce = true; - } - requestAnimationFrame(detectSound); - }; - detectSound(); + const startConversation = useCallback(async () => { + setAudioConnection('connecting'); + await startAudioStream(); + }, [startAudioStream]); - setTimeout(() => { - if (!spokeAtLeastOnce) { - mediaRecorder.stop(); - stream.getTracks().forEach((track) => track.stop()); - } else { - setTimer( - setInterval(() => { - if (!isSpeaking) { - mediaRecorder.stop(); - stream.getTracks().forEach((track) => track.stop()); - } else { - isSpeaking = false; - } - }, silence_timeout) - ); 
- } - }, initial_silence_timeout); - }) - .catch((err) => { - setError(err.message); - }); - }, - [timer, isRecording, config, sendAudioChunk, endAudioStream] - ); + const endConversation = useCallback(async () => { + setAudioConnection('off'); + await wavRecorder.end(); + await wavStreamPlayer.interrupt(); + await endAudioStream(); + }, [endAudioStream, wavRecorder, wavStreamPlayer]); return { - startRecording, - stopRecording, - cancelRecording, - isRecording, - isSpeaking, - isRecordingFinished, - error + startConversation, + endConversation, + audioConnection, + isAiSpeaking, + wavRecorder, + wavStreamPlayer }; }; diff --git a/libs/react-client/src/useChatInteract.ts b/libs/react-client/src/useChatInteract.ts index 5aeb34be3c..ab646de970 100644 --- a/libs/react-client/src/useChatInteract.ts +++ b/libs/react-client/src/useChatInteract.ts @@ -90,8 +90,17 @@ const useChatInteract = () => { [session?.socket] ); + const startAudioStream = useCallback(() => { + session?.socket.emit('audio_start'); + }, [session?.socket]); + const sendAudioChunk = useCallback( - (isStart: boolean, mimeType: string, elapsedTime: number, data: Blob) => { + ( + isStart: boolean, + mimeType: string, + elapsedTime: number, + data: Int16Array + ) => { session?.socket.emit('audio_chunk', { isStart, mimeType, @@ -102,12 +111,9 @@ const useChatInteract = () => { [session?.socket] ); - const endAudioStream = useCallback( - (fileReferences?: IFileRef[]) => { - session?.socket.emit('audio_end', { fileReferences }); - }, - [session?.socket] - ); + const endAudioStream = useCallback(() => { + session?.socket.emit('audio_end'); + }, [session?.socket]); const replyMessage = useCallback( (message: IStep) => { @@ -179,6 +185,7 @@ const useChatInteract = () => { replyMessage, sendMessage, editMessage, + startAudioStream, sendAudioChunk, endAudioStream, stopTask, diff --git a/libs/react-client/src/useChatSession.ts b/libs/react-client/src/useChatSession.ts index ffa817c586..fc1de3fbd5 100644 --- a/libs/react-client/src/useChatSession.ts +++ b/libs/react-client/src/useChatSession.ts @@ -10,6 +10,7 @@ import io from 'socket.io-client'; import { actionState, askUserState, + audioConnectionState, callFnState, chatProfileState, chatSettingsInputsState, @@ -17,13 +18,16 @@ import { currentThreadIdState, elementState, firstUserInteraction, + isAiSpeakingState, loadingState, messagesState, sessionIdState, sessionState, tasklistState, threadIdToResumeState, - tokenCountState + tokenCountState, + wavRecorderState, + wavStreamPlayerState } from 'src/state'; import { IAction, @@ -40,6 +44,8 @@ import { updateMessageContentById } from 'src/utils/message'; +import { OutputAudioChunk } from './types/audio'; + import { ChainlitContext } from './context'; import type { IToken } from './useChatData'; @@ -48,10 +54,13 @@ const useChatSession = () => { const sessionId = useRecoilValue(sessionIdState); const [session, setSession] = useRecoilState(sessionState); - + const setIsAiSpeaking = useSetRecoilState(isAiSpeakingState); + const setAudioConnection = useSetRecoilState(audioConnectionState); const resetChatSettingsValue = useResetRecoilState(chatSettingsValueState); const setFirstUserInteraction = useSetRecoilState(firstUserInteraction); const setLoading = useSetRecoilState(loadingState); + const wavStreamPlayer = useRecoilValue(wavStreamPlayerState); + const wavRecorder = useRecoilValue(wavRecorderState); const setMessages = useSetRecoilState(messagesState); const setAskUser = useSetRecoilState(askUserState); const setCallFn = 
useSetRecoilState(callFnState); @@ -132,6 +141,41 @@ const useChatSession = () => { window.location.reload(); }); + socket.on('audio_connection', async (state: 'on' | 'off') => { + if (state === 'on') { + let isFirstChunk = true; + const startTime = Date.now(); + const mimeType = 'pcm16'; + // Connect to microphone + await wavRecorder.begin(); + await wavStreamPlayer.connect(); + await wavRecorder.record(async (data) => { + const elapsedTime = Date.now() - startTime; + socket.emit('audio_chunk', { + isStart: isFirstChunk, + mimeType, + elapsedTime, + data: data.mono + }); + isFirstChunk = false; + }); + wavStreamPlayer.onStop = () => setIsAiSpeaking(false); + } else { + await wavRecorder.end(); + await wavStreamPlayer.interrupt(); + } + setAudioConnection(state); + }); + + socket.on('audio_chunk', (chunk: OutputAudioChunk) => { + wavStreamPlayer.add16BitPCM(chunk.data, chunk.track); + setIsAiSpeaking(true); + }); + + socket.on('audio_interrupt', () => { + wavStreamPlayer.interrupt(); + }); + socket.on('resume_thread', (thread: IThread) => { let messages: IStep[] = []; for (const step of thread.steps) { diff --git a/libs/react-client/src/wavtools/analysis/audio_analysis.js b/libs/react-client/src/wavtools/analysis/audio_analysis.js new file mode 100644 index 0000000000..eb4836904a --- /dev/null +++ b/libs/react-client/src/wavtools/analysis/audio_analysis.js @@ -0,0 +1,203 @@ +import { + noteFrequencies, + noteFrequencyLabels, + voiceFrequencies, + voiceFrequencyLabels +} from './constants.js'; + +/** + * Output of AudioAnalysis for the frequency domain of the audio + * @typedef {Object} AudioAnalysisOutputType + * @property {Float32Array} values Amplitude of this frequency between {0, 1} inclusive + * @property {number[]} frequencies Raw frequency bucket values + * @property {string[]} labels Labels for the frequency bucket values + */ + +/** + * Analyzes audio for visual output + * @class + */ +export class AudioAnalysis { + /** + * Retrieves frequency domain data from an AnalyserNode adjusted to a decibel range + * returns human-readable formatting and labels + * @param {AnalyserNode} analyser + * @param {number} sampleRate + * @param {Float32Array} [fftResult] + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {AudioAnalysisOutputType} + */ + static getFrequencies( + analyser, + sampleRate, + fftResult, + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + if (!fftResult) { + fftResult = new Float32Array(analyser.frequencyBinCount); + analyser.getFloatFrequencyData(fftResult); + } + const nyquistFrequency = sampleRate / 2; + const frequencyStep = (1 / fftResult.length) * nyquistFrequency; + let outputValues; + let frequencies; + let labels; + if (analysisType === 'music' || analysisType === 'voice') { + const useFrequencies = + analysisType === 'voice' ? voiceFrequencies : noteFrequencies; + const aggregateOutput = Array(useFrequencies.length).fill(minDecibels); + for (let i = 0; i < fftResult.length; i++) { + const frequency = i * frequencyStep; + const amplitude = fftResult[i]; + for (let n = useFrequencies.length - 1; n >= 0; n--) { + if (frequency > useFrequencies[n]) { + aggregateOutput[n] = Math.max(aggregateOutput[n], amplitude); + break; + } + } + } + outputValues = aggregateOutput; + frequencies = + analysisType === 'voice' ? voiceFrequencies : noteFrequencies; + labels = + analysisType === 'voice' ? 
voiceFrequencyLabels : noteFrequencyLabels; + } else { + outputValues = Array.from(fftResult); + frequencies = outputValues.map((_, i) => frequencyStep * i); + labels = frequencies.map((f) => `${f.toFixed(2)} Hz`); + } + // We normalize to {0, 1} + const normalizedOutput = outputValues.map((v) => { + return Math.max( + 0, + Math.min((v - minDecibels) / (maxDecibels - minDecibels), 1) + ); + }); + const values = new Float32Array(normalizedOutput); + return { + values, + frequencies, + labels + }; + } + + /** + * Creates a new AudioAnalysis instance for an HTMLAudioElement + * @param {HTMLAudioElement} audioElement + * @param {AudioBuffer|null} [audioBuffer] If provided, will cache all frequency domain data from the buffer + * @returns {AudioAnalysis} + */ + constructor(audioElement, audioBuffer = null) { + this.fftResults = []; + if (audioBuffer) { + /** + * Modified from + * https://stackoverflow.com/questions/75063715/using-the-web-audio-api-to-analyze-a-song-without-playing + * + * We do this to populate FFT values for the audio if provided an `audioBuffer` + * The reason to do this is that Safari fails when using `createMediaElementSource` + * This has a non-zero RAM cost so we only opt-in to run it on Safari, Chrome is better + */ + const { length, sampleRate } = audioBuffer; + const offlineAudioContext = new OfflineAudioContext({ + length, + sampleRate + }); + const source = offlineAudioContext.createBufferSource(); + source.buffer = audioBuffer; + const analyser = offlineAudioContext.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + source.connect(analyser); + // limit is :: 128 / sampleRate; + // but we just want 60fps - cuts ~1s from 6MB to 1MB of RAM + const renderQuantumInSeconds = 1 / 60; + const durationInSeconds = length / sampleRate; + const analyze = (index) => { + const suspendTime = renderQuantumInSeconds * index; + if (suspendTime < durationInSeconds) { + offlineAudioContext.suspend(suspendTime).then(() => { + const fftResult = new Float32Array(analyser.frequencyBinCount); + analyser.getFloatFrequencyData(fftResult); + this.fftResults.push(fftResult); + analyze(index + 1); + }); + } + if (index === 1) { + offlineAudioContext.startRendering(); + } else { + offlineAudioContext.resume(); + } + }; + source.start(0); + analyze(1); + this.audio = audioElement; + this.context = offlineAudioContext; + this.analyser = analyser; + this.sampleRate = sampleRate; + this.audioBuffer = audioBuffer; + } else { + const audioContext = new AudioContext(); + const track = audioContext.createMediaElementSource(audioElement); + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + track.connect(analyser); + analyser.connect(audioContext.destination); + this.audio = audioElement; + this.context = audioContext; + this.analyser = analyser; + this.sampleRate = this.context.sampleRate; + this.audioBuffer = null; + } + } + + /** + * Gets the current frequency domain data from the playing audio track + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {AudioAnalysisOutputType} + */ + getFrequencies( + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + let fftResult = null; + if (this.audioBuffer && this.fftResults.length) { + const pct = this.audio.currentTime / this.audio.duration; + const index = Math.min( + (pct * this.fftResults.length) | 0, + 
this.fftResults.length - 1 + ); + fftResult = this.fftResults[index]; + } + return AudioAnalysis.getFrequencies( + this.analyser, + this.sampleRate, + fftResult, + analysisType, + minDecibels, + maxDecibels + ); + } + + /** + * Resume the internal AudioContext if it was suspended due to the lack of + * user interaction when the AudioAnalysis was instantiated. + * @returns {Promise} + */ + async resumeIfSuspended() { + if (this.context.state === 'suspended') { + await this.context.resume(); + } + return true; + } +} + +globalThis.AudioAnalysis = AudioAnalysis; diff --git a/libs/react-client/src/wavtools/analysis/constants.js b/libs/react-client/src/wavtools/analysis/constants.js new file mode 100644 index 0000000000..4c57e557de --- /dev/null +++ b/libs/react-client/src/wavtools/analysis/constants.js @@ -0,0 +1,60 @@ +/** + * Constants for help with visualization + * Helps map frequency ranges from Fast Fourier Transform + * to human-interpretable ranges, notably music ranges and + * human vocal ranges. + */ + +// Eighth octave frequencies +const octave8Frequencies = [ + 4186.01, 4434.92, 4698.63, 4978.03, 5274.04, 5587.65, 5919.91, 6271.93, + 6644.88, 7040.0, 7458.62, 7902.13 +]; + +// Labels for each of the above frequencies +const octave8FrequencyLabels = [ + 'C', + 'C#', + 'D', + 'D#', + 'E', + 'F', + 'F#', + 'G', + 'G#', + 'A', + 'A#', + 'B' +]; + +/** + * All note frequencies from 1st to 8th octave + * in format "A#8" (A#, 8th octave) + */ +export const noteFrequencies = []; +export const noteFrequencyLabels = []; +for (let i = 1; i <= 8; i++) { + for (let f = 0; f < octave8Frequencies.length; f++) { + const freq = octave8Frequencies[f]; + noteFrequencies.push(freq / Math.pow(2, 8 - i)); + noteFrequencyLabels.push(octave8FrequencyLabels[f] + i); + } +} + +/** + * Subset of the note frequencies between 32 and 2000 Hz + * 6 octave range: C1 to B6 + */ +const voiceFrequencyRange = [32.0, 2000.0]; +export const voiceFrequencies = noteFrequencies.filter((_, i) => { + return ( + noteFrequencies[i] > voiceFrequencyRange[0] && + noteFrequencies[i] < voiceFrequencyRange[1] + ); +}); +export const voiceFrequencyLabels = noteFrequencyLabels.filter((_, i) => { + return ( + noteFrequencies[i] > voiceFrequencyRange[0] && + noteFrequencies[i] < voiceFrequencyRange[1] + ); +}); diff --git a/libs/react-client/src/wavtools/index.ts b/libs/react-client/src/wavtools/index.ts new file mode 100644 index 0000000000..62bd593449 --- /dev/null +++ b/libs/react-client/src/wavtools/index.ts @@ -0,0 +1,7 @@ +// Courtesy of https://github.com/openai/openai-realtime-console +import { AudioAnalysis } from './analysis/audio_analysis.js'; +import { WavPacker } from './wav_packer.js'; +import { WavRecorder } from './wav_recorder.js'; +import { WavStreamPlayer } from './wav_stream_player.js'; + +export { AudioAnalysis, WavPacker, WavStreamPlayer, WavRecorder }; diff --git a/libs/react-client/src/wavtools/wav_packer.js b/libs/react-client/src/wavtools/wav_packer.js new file mode 100644 index 0000000000..1e11780bbc --- /dev/null +++ b/libs/react-client/src/wavtools/wav_packer.js @@ -0,0 +1,113 @@ +/** + * Raw wav audio file contents + * @typedef {Object} WavPackerAudioType + * @property {Blob} blob + * @property {string} url + * @property {number} channelCount + * @property {number} sampleRate + * @property {number} duration + */ + +/** + * Utility class for assembling PCM16 "audio/wav" data + * @class + */ +export class WavPacker { + /** + * Converts Float32Array of amplitude data to ArrayBuffer in Int16Array format + * 
@param {Float32Array} float32Array + * @returns {ArrayBuffer} + */ + static floatTo16BitPCM(float32Array) { + const buffer = new ArrayBuffer(float32Array.length * 2); + const view = new DataView(buffer); + let offset = 0; + for (let i = 0; i < float32Array.length; i++, offset += 2) { + let s = Math.max(-1, Math.min(1, float32Array[i])); + view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); + } + return buffer; + } + + /** + * Concatenates two ArrayBuffers + * @param {ArrayBuffer} leftBuffer + * @param {ArrayBuffer} rightBuffer + * @returns {ArrayBuffer} + */ + static mergeBuffers(leftBuffer, rightBuffer) { + const tmpArray = new Uint8Array( + leftBuffer.byteLength + rightBuffer.byteLength + ); + tmpArray.set(new Uint8Array(leftBuffer), 0); + tmpArray.set(new Uint8Array(rightBuffer), leftBuffer.byteLength); + return tmpArray.buffer; + } + + /** + * Packs data into an Int16 format + * @private + * @param {number} size 0 = 1x Int16, 1 = 2x Int16 + * @param {number} arg value to pack + * @returns + */ + _packData(size, arg) { + return [ + new Uint8Array([arg, arg >> 8]), + new Uint8Array([arg, arg >> 8, arg >> 16, arg >> 24]) + ][size]; + } + + /** + * Packs audio into "audio/wav" Blob + * @param {number} sampleRate + * @param {{bitsPerSample: number, channels: Array, data: Int16Array}} audio + * @returns {WavPackerAudioType} + */ + pack(sampleRate, audio) { + if (!audio?.bitsPerSample) { + throw new Error(`Missing "bitsPerSample"`); + } else if (!audio?.channels) { + throw new Error(`Missing "channels"`); + } else if (!audio?.data) { + throw new Error(`Missing "data"`); + } + const { bitsPerSample, channels, data } = audio; + const output = [ + // Header + 'RIFF', + this._packData( + 1, + 4 + (8 + 24) /* chunk 1 length */ + (8 + 8) /* chunk 2 length */ + ), // Length + 'WAVE', + // chunk 1 + 'fmt ', // Sub-chunk identifier + this._packData(1, 16), // Chunk length + this._packData(0, 1), // Audio format (1 is linear quantization) + this._packData(0, channels.length), + this._packData(1, sampleRate), + this._packData(1, (sampleRate * channels.length * bitsPerSample) / 8), // Byte rate + this._packData(0, (channels.length * bitsPerSample) / 8), + this._packData(0, bitsPerSample), + // chunk 2 + 'data', // Sub-chunk identifier + this._packData( + 1, + (channels[0].length * channels.length * bitsPerSample) / 8 + ), // Chunk length + data + ]; + const blob = new Blob(output, { type: 'audio/mpeg' }); + const url = URL.createObjectURL(blob); + return { + blob, + url, + channelCount: channels.length, + sampleRate, + duration: data.byteLength / (channels.length * sampleRate * 2) + }; + } +} + +globalThis.WavPacker = WavPacker; diff --git a/libs/react-client/src/wavtools/wav_recorder.js b/libs/react-client/src/wavtools/wav_recorder.js new file mode 100644 index 0000000000..9b36a205d0 --- /dev/null +++ b/libs/react-client/src/wavtools/wav_recorder.js @@ -0,0 +1,548 @@ +import { AudioAnalysis } from './analysis/audio_analysis.js'; +import { WavPacker } from './wav_packer.js'; +import { AudioProcessorSrc } from './worklets/audio_processor.js'; + +/** + * Decodes audio into a wav file + * @typedef {Object} DecodedAudioType + * @property {Blob} blob + * @property {string} url + * @property {Float32Array} values + * @property {AudioBuffer} audioBuffer + */ + +/** + * Records live stream of user audio as PCM16 "audio/wav" data + * @class + */ +export class WavRecorder { + /** + * Create a new WavRecorder instance + * @param {{sampleRate?: number, outputToSpeakers?: boolean, debug?: boolean}} 
[options] + * @returns {WavRecorder} + */ + constructor({ + sampleRate = 24000, + outputToSpeakers = false, + debug = false + } = {}) { + // Script source + this.scriptSrc = AudioProcessorSrc; + // Config + this.sampleRate = sampleRate; + this.outputToSpeakers = outputToSpeakers; + this.debug = !!debug; + this._deviceChangeCallback = null; + this._devices = []; + // State variables + this.stream = null; + this.processor = null; + this.source = null; + this.node = null; + this.recording = false; + // Event handling with AudioWorklet + this._lastEventId = 0; + this.eventReceipts = {}; + this.eventTimeout = 5000; + // Process chunks of audio + this._chunkProcessor = () => {}; + this._chunkProcessorSize = void 0; + this._chunkProcessorBuffer = { + raw: new ArrayBuffer(0), + mono: new ArrayBuffer(0) + }; + } + + /** + * Decodes audio data from multiple formats to a Blob, url, Float32Array and AudioBuffer + * @param {Blob|Float32Array|Int16Array|ArrayBuffer|number[]} audioData + * @param {number} sampleRate + * @param {number} fromSampleRate + * @returns {Promise} + */ + static async decode(audioData, sampleRate = 24000, fromSampleRate = -1) { + const context = new AudioContext({ sampleRate }); + let arrayBuffer; + let blob; + if (audioData instanceof Blob) { + if (fromSampleRate !== -1) { + throw new Error( + `Can not specify "fromSampleRate" when reading from Blob` + ); + } + blob = audioData; + arrayBuffer = await blob.arrayBuffer(); + } else if (audioData instanceof ArrayBuffer) { + if (fromSampleRate !== -1) { + throw new Error( + `Can not specify "fromSampleRate" when reading from ArrayBuffer` + ); + } + arrayBuffer = audioData; + blob = new Blob([arrayBuffer], { type: 'audio/wav' }); + } else { + let float32Array; + let data; + if (audioData instanceof Int16Array) { + data = audioData; + float32Array = new Float32Array(audioData.length); + for (let i = 0; i < audioData.length; i++) { + float32Array[i] = audioData[i] / 0x8000; + } + } else if (audioData instanceof Float32Array) { + float32Array = audioData; + } else if (audioData instanceof Array) { + float32Array = new Float32Array(audioData); + } else { + throw new Error( + `"audioData" must be one of: Blob, Float32Arrray, Int16Array, ArrayBuffer, Array` + ); + } + if (fromSampleRate === -1) { + throw new Error( + `Must specify "fromSampleRate" when reading from Float32Array, In16Array or Array` + ); + } else if (fromSampleRate < 3000) { + throw new Error(`Minimum "fromSampleRate" is 3000 (3kHz)`); + } + if (!data) { + data = WavPacker.floatTo16BitPCM(float32Array); + } + const audio = { + bitsPerSample: 16, + channels: [float32Array], + data + }; + const packer = new WavPacker(); + const result = packer.pack(fromSampleRate, audio); + blob = result.blob; + arrayBuffer = await blob.arrayBuffer(); + } + const audioBuffer = await context.decodeAudioData(arrayBuffer); + const values = audioBuffer.getChannelData(0); + const url = URL.createObjectURL(blob); + return { + blob, + url, + values, + audioBuffer + }; + } + + /** + * Logs data in debug mode + * @param {...any} arguments + * @returns {true} + */ + log() { + if (this.debug) { + this.log(...arguments); + } + return true; + } + + /** + * Retrieves the current sampleRate for the recorder + * @returns {number} + */ + getSampleRate() { + return this.sampleRate; + } + + /** + * Retrieves the current status of the recording + * @returns {"ended"|"paused"|"recording"} + */ + getStatus() { + if (!this.processor) { + return 'ended'; + } else if (!this.recording) { + return 'paused'; + } else { + 
return 'recording'; + } + } + + /** + * Sends an event to the AudioWorklet + * @private + * @param {string} name + * @param {{[key: string]: any}} data + * @param {AudioWorkletNode} [_processor] + * @returns {Promise<{[key: string]: any}>} + */ + async _event(name, data = {}, _processor = null) { + _processor = _processor || this.processor; + if (!_processor) { + throw new Error('Can not send events without recording first'); + } + const message = { + event: name, + id: this._lastEventId++, + data + }; + _processor.port.postMessage(message); + const t0 = new Date().valueOf(); + while (!this.eventReceipts[message.id]) { + if (new Date().valueOf() - t0 > this.eventTimeout) { + throw new Error(`Timeout waiting for "${name}" event`); + } + await new Promise((res) => setTimeout(() => res(true), 1)); + } + const payload = this.eventReceipts[message.id]; + delete this.eventReceipts[message.id]; + return payload; + } + + /** + * Sets device change callback, remove if callback provided is `null` + * @param {(Array): void|null} callback + * @returns {true} + */ + listenForDeviceChange(callback) { + if (callback === null && this._deviceChangeCallback) { + navigator.mediaDevices.removeEventListener( + 'devicechange', + this._deviceChangeCallback + ); + this._deviceChangeCallback = null; + } else if (callback !== null) { + // Basically a debounce; we only want this called once when devices change + // And we only want the most recent callback() to be executed + // if a few are operating at the same time + let lastId = 0; + let lastDevices = []; + const serializeDevices = (devices) => + devices + .map((d) => d.deviceId) + .sort() + .join(','); + const cb = async () => { + let id = ++lastId; + const devices = await this.listDevices(); + if (id === lastId) { + if (serializeDevices(lastDevices) !== serializeDevices(devices)) { + lastDevices = devices; + callback(devices.slice()); + } + } + }; + navigator.mediaDevices.addEventListener('devicechange', cb); + cb(); + this._deviceChangeCallback = cb; + } + return true; + } + + /** + * Manually request permission to use the microphone + * @returns {Promise} + */ + async requestPermission() { + const permissionStatus = await navigator.permissions.query({ + name: 'microphone' + }); + if (permissionStatus.state === 'denied') { + window.alert('You must grant microphone access to use this feature.'); + } else if (permissionStatus.state === 'prompt') { + try { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: true + }); + const tracks = stream.getTracks(); + tracks.forEach((track) => track.stop()); + } catch (e) { + window.alert('You must grant microphone access to use this feature.'); + } + } + return true; + } + + /** + * List all eligible devices for recording, will request permission to use microphone + * @returns {Promise>} + */ + async listDevices() { + if ( + !navigator.mediaDevices || + !('enumerateDevices' in navigator.mediaDevices) + ) { + throw new Error('Could not request user devices'); + } + await this.requestPermission(); + const devices = await navigator.mediaDevices.enumerateDevices(); + const audioDevices = devices.filter( + (device) => device.kind === 'audioinput' + ); + const defaultDeviceIndex = audioDevices.findIndex( + (device) => device.deviceId === 'default' + ); + const deviceList = []; + if (defaultDeviceIndex !== -1) { + let defaultDevice = audioDevices.splice(defaultDeviceIndex, 1)[0]; + let existingIndex = audioDevices.findIndex( + (device) => device.groupId === defaultDevice.groupId + ); + if (existingIndex !== -1) { 
+ defaultDevice = audioDevices.splice(existingIndex, 1)[0]; + } + defaultDevice.default = true; + deviceList.push(defaultDevice); + } + return deviceList.concat(audioDevices); + } + + /** + * Begins a recording session and requests microphone permissions if not already granted + * Microphone recording indicator will appear on browser tab but status will be "paused" + * @param {string} [deviceId] if no device provided, default device will be used + * @returns {Promise} + */ + async begin(deviceId) { + if (this.processor) { + throw new Error( + `Already connected: please call .end() to start a new session` + ); + } + + if ( + !navigator.mediaDevices || + !('getUserMedia' in navigator.mediaDevices) + ) { + throw new Error('Could not request user media'); + } + try { + const config = { audio: true }; + if (deviceId) { + config.audio = { deviceId: { exact: deviceId } }; + } + this.stream = await navigator.mediaDevices.getUserMedia(config); + } catch (err) { + throw new Error('Could not start media stream'); + } + + const context = new AudioContext({ sampleRate: this.sampleRate }); + const source = context.createMediaStreamSource(this.stream); + // Load and execute the module script. + try { + await context.audioWorklet.addModule(this.scriptSrc); + } catch (e) { + console.error(e); + throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); + } + const processor = new AudioWorkletNode(context, 'audio_processor'); + processor.port.onmessage = (e) => { + const { event, id, data } = e.data; + if (event === 'receipt') { + this.eventReceipts[id] = data; + } else if (event === 'chunk') { + if (this._chunkProcessorSize) { + const buffer = this._chunkProcessorBuffer; + this._chunkProcessorBuffer = { + raw: WavPacker.mergeBuffers(buffer.raw, data.raw), + mono: WavPacker.mergeBuffers(buffer.mono, data.mono) + }; + if ( + this._chunkProcessorBuffer.mono.byteLength >= + this._chunkProcessorSize + ) { + this._chunkProcessor(this._chunkProcessorBuffer); + this._chunkProcessorBuffer = { + raw: new ArrayBuffer(0), + mono: new ArrayBuffer(0) + }; + } + } else { + this._chunkProcessor(data); + } + } + }; + + const node = source.connect(processor); + const analyser = context.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + node.connect(analyser); + if (this.outputToSpeakers) { + // eslint-disable-next-line no-console + console.warn( + 'Warning: Output to speakers may affect sound quality,\n' + + 'especially due to system audio feedback preventative measures.\n' + + 'use only for debugging' + ); + analyser.connect(context.destination); + } + + this.source = source; + this.node = node; + this.analyser = analyser; + this.processor = processor; + return true; + } + + /** + * Gets the current frequency domain data from the recording track + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} + */ + getFrequencies( + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + return AudioAnalysis.getFrequencies( + this.analyser, + this.sampleRate, + null, + analysisType, + minDecibels, + maxDecibels + ); + } + + /** + * Pauses the recording + * Keeps microphone stream open but halts storage of audio + * @returns {Promise} + */ + async pause() { + if (!this.processor) { + throw new 
Error('Session ended: please call .begin() first'); + } else if (!this.recording) { + throw new Error('Already paused: please call .record() first'); + } + if (this._chunkProcessorBuffer.raw.byteLength) { + this._chunkProcessor(this._chunkProcessorBuffer); + } + this.log('Pausing ...'); + await this._event('stop'); + this.recording = false; + return true; + } + + /** + * Start recording stream and storing to memory from the connected audio source + * @param {(data: { mono: Int16Array; raw: Int16Array }) => any} [chunkProcessor] + * @param {number} [chunkSize] chunkProcessor will not be triggered until this size threshold met in mono audio + * @returns {Promise} + */ + async record(chunkProcessor = () => {}, chunkSize = 8192) { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } else if (this.recording) { + throw new Error('Already recording: please call .pause() first'); + } else if (typeof chunkProcessor !== 'function') { + throw new Error(`chunkProcessor must be a function`); + } + this._chunkProcessor = chunkProcessor; + this._chunkProcessorSize = chunkSize; + this._chunkProcessorBuffer = { + raw: new ArrayBuffer(0), + mono: new ArrayBuffer(0) + }; + this.log('Recording ...'); + await this._event('start'); + this.recording = true; + return true; + } + + /** + * Clears the audio buffer, empties stored recording + * @returns {Promise} + */ + async clear() { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + await this._event('clear'); + return true; + } + + /** + * Reads the current audio stream data + * @returns {Promise<{meanValues: Float32Array, channels: Array}>} + */ + async read() { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + this.log('Reading ...'); + const result = await this._event('read'); + return result; + } + + /** + * Saves the current audio stream to a file + * @param {boolean} [force] Force saving while still recording + * @returns {Promise} + */ + async save(force = false) { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + if (!force && this.recording) { + throw new Error( + 'Currently recording: please call .pause() first, or call .save(true) to force' + ); + } + this.log('Exporting ...'); + const exportData = await this._event('export'); + const packer = new WavPacker(); + const result = packer.pack(this.sampleRate, exportData.audio); + return result; + } + + /** + * Ends the current recording session and saves the result + * @returns {Promise} + */ + async end() { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + + const _processor = this.processor; + + this.log('Stopping ...'); + await this._event('stop'); + this.recording = false; + const tracks = this.stream.getTracks(); + tracks.forEach((track) => track.stop()); + + this.log('Exporting ...'); + const exportData = await this._event('export', {}, _processor); + + this.processor.disconnect(); + this.source.disconnect(); + this.node.disconnect(); + this.analyser.disconnect(); + this.stream = null; + this.processor = null; + this.source = null; + this.node = null; + + const packer = new WavPacker(); + const result = packer.pack(this.sampleRate, exportData.audio); + return result; + } + + /** + * Performs a full cleanup of WavRecorder instance + * Stops actively listening via microphone and removes existing listeners + * @returns {Promise} + */ + async quit() { + this.listenForDeviceChange(null); + 
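+ // End any active session so the microphone tracks and audio worklet are released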
if (this.processor) { + await this.end(); + } + return true; + } +} + +globalThis.WavRecorder = WavRecorder; diff --git a/libs/react-client/src/wavtools/wav_renderer.ts b/libs/react-client/src/wavtools/wav_renderer.ts new file mode 100644 index 0000000000..5e556db325 --- /dev/null +++ b/libs/react-client/src/wavtools/wav_renderer.ts @@ -0,0 +1,132 @@ +const dataMap = new WeakMap(); + +/** + * Normalizes a Float32Array to Array(m): We use this to draw amplitudes on a graph + * If we're rendering the same audio data, then we'll often be using + * the same (data, m, downsamplePeaks) triplets so we give option to memoize + */ +const normalizeArray = ( + data: Float32Array, + m: number, + downsamplePeaks: boolean = false, + memoize: boolean = false +) => { + let cache, mKey, dKey; + if (memoize) { + mKey = m.toString(); + dKey = downsamplePeaks.toString(); + cache = dataMap.has(data) ? dataMap.get(data) : {}; + dataMap.set(data, cache); + cache[mKey] = cache[mKey] || {}; + if (cache[mKey][dKey]) { + return cache[mKey][dKey]; + } + } + const n = data.length; + const result = new Array(m); + if (m <= n) { + // Downsampling + result.fill(0); + const count = new Array(m).fill(0); + for (let i = 0; i < n; i++) { + const index = Math.floor(i * (m / n)); + if (downsamplePeaks) { + // take highest result in the set + result[index] = Math.max(result[index], Math.abs(data[i])); + } else { + result[index] += Math.abs(data[i]); + } + count[index]++; + } + if (!downsamplePeaks) { + for (let i = 0; i < result.length; i++) { + result[i] = result[i] / count[i]; + } + } + } else { + for (let i = 0; i < m; i++) { + const index = (i * (n - 1)) / (m - 1); + const low = Math.floor(index); + const high = Math.ceil(index); + const t = index - low; + if (high >= n) { + result[i] = data[n - 1]; + } else { + result[i] = data[low] * (1 - t) + data[high] * t; + } + } + } + if (memoize) { + cache[mKey as string][dKey as string] = result; + } + return result; +}; + +export const WavRenderer = { + /** + * Renders a point-in-time snapshot of an audio sample, usually frequency values + * @param ctx + * @param data + * @param color + * @param cssWidth + * @param cssHeight + * @param pointCount number of bars to render + * @param barWidth width of bars in px + * @param barSpacing spacing between bars in px + * @param center vertically center the bars + */ + drawBars: ( + ctx: CanvasRenderingContext2D, + data: Float32Array, + cssWidth: number, + cssHeight: number, + color: string, + pointCount: number = 0, + barWidth: number = 0, + barSpacing: number = 0, + center: boolean = false + ) => { + pointCount = Math.floor( + Math.min( + pointCount, + (cssWidth - barSpacing) / (Math.max(barWidth, 1) + barSpacing) + ) + ); + if (!pointCount) { + pointCount = Math.floor( + (cssWidth - barSpacing) / (Math.max(barWidth, 1) + barSpacing) + ); + } + if (!barWidth) { + barWidth = (cssWidth - barSpacing) / pointCount - barSpacing; + } + const points = normalizeArray(data, pointCount, true); + for (let i = 0; i < pointCount; i++) { + const amplitude = Math.abs(points[i]); + const height = Math.max(1, amplitude * cssHeight); + const x = barSpacing + i * (barWidth + barSpacing); + const y = center ? 
(cssHeight - height) / 2 : cssHeight - height; + const radius = Math.min(barWidth / 2, height / 2); // Calculate the radius for rounded corners + + ctx.fillStyle = color; + ctx.beginPath(); + ctx.moveTo(x + radius, y); + ctx.lineTo(x + barWidth - radius, y); + ctx.arcTo(x + barWidth, y, x + barWidth, y + radius, radius); + ctx.lineTo(x + barWidth, y + height - radius); + ctx.arcTo( + x + barWidth, + y + height, + x + barWidth - radius, + y + height, + radius + ); + ctx.lineTo(x + radius, y + height); + ctx.arcTo(x, y + height, x, y + height - radius, radius); + ctx.lineTo(x, y + radius); + ctx.arcTo(x, y, x + radius, y, radius); + ctx.closePath(); + ctx.fill(); + } + } +}; diff --git a/libs/react-client/src/wavtools/wav_stream_player.js b/libs/react-client/src/wavtools/wav_stream_player.js new file mode 100644 index 0000000000..6cbd061539 --- /dev/null +++ b/libs/react-client/src/wavtools/wav_stream_player.js @@ -0,0 +1,162 @@ +import { AudioAnalysis } from './analysis/audio_analysis.js'; +import { StreamProcessorSrc } from './worklets/stream_processor.js'; + +/** + * Plays audio streams received in raw PCM16 chunks from the browser + * @class + */ +export class WavStreamPlayer { + /** + * Creates a new WavStreamPlayer instance + * @param {{sampleRate?: number}} options + * @returns {WavStreamPlayer} + */ + constructor({ sampleRate = 24000, onStop } = {}) { + this.scriptSrc = StreamProcessorSrc; + this.onStop = onStop; + this.sampleRate = sampleRate; + this.context = null; + this.stream = null; + this.analyser = null; + this.trackSampleOffsets = {}; + this.interruptedTrackIds = {}; + } + + /** + * Connects the audio context and enables output to speakers + * @returns {Promise} + */ + async connect() { + this.context = new AudioContext({ sampleRate: this.sampleRate }); + if (this.context.state === 'suspended') { + await this.context.resume(); + } + try { + await this.context.audioWorklet.addModule(this.scriptSrc); + } catch (e) { + console.error(e); + throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); + } + const analyser = this.context.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + this.analyser = analyser; + return true; + } + + /** + * Gets the current frequency domain data from the playing track + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} + */ + getFrequencies( + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + if (!this.analyser) { + throw new Error('Not connected, please call .connect() first'); + } + return AudioAnalysis.getFrequencies( + this.analyser, + this.sampleRate, + null, + analysisType, + minDecibels, + maxDecibels + ); + } + + /** + * Starts audio streaming + * @private + * @returns {Promise} + */ + _start() { + const streamNode = new AudioWorkletNode(this.context, 'stream_processor'); + streamNode.connect(this.context.destination); + streamNode.port.onmessage = (e) => { + const { event } = e.data; + if (event === 'stop') { + this.onStop?.(); + streamNode.disconnect(); + this.stream = null; + } else if (event === 'offset') { + const { requestId, trackId, offset } = e.data; + const currentTime = offset / this.sampleRate; + this.trackSampleOffsets[requestId] = { trackId, offset, currentTime }; + } + }; + this.analyser.disconnect(); + streamNode.connect(this.analyser); + this.stream = streamNode; + return 
true; + } + + /** + * Adds 16BitPCM data to the currently playing audio stream + * You can add chunks beyond the current play point and they will be queued for play + * @param {ArrayBuffer|Int16Array} arrayBuffer + * @param {string} [trackId] + * @returns {Int16Array} + */ + add16BitPCM(arrayBuffer, trackId = 'default') { + if (typeof trackId !== 'string') { + throw new Error(`trackId must be a string`); + } else if (this.interruptedTrackIds[trackId]) { + return; + } + if (!this.stream) { + this._start(); + } + let buffer; + if (arrayBuffer instanceof Int16Array) { + buffer = arrayBuffer; + } else if (arrayBuffer instanceof ArrayBuffer) { + buffer = new Int16Array(arrayBuffer); + } else { + throw new Error(`argument must be Int16Array or ArrayBuffer`); + } + this.stream.port.postMessage({ event: 'write', buffer, trackId }); + return buffer; + } + + /** + * Gets the offset (sample count) of the currently playing stream + * @param {boolean} [interrupt] + * @returns {{trackId: string|null, offset: number, currentTime: number}} + */ + async getTrackSampleOffset(interrupt = false) { + if (!this.stream) { + return null; + } + const requestId = crypto.randomUUID(); + this.stream.port.postMessage({ + event: interrupt ? 'interrupt' : 'offset', + requestId + }); + let trackSampleOffset; + while (!trackSampleOffset) { + trackSampleOffset = this.trackSampleOffsets[requestId]; + await new Promise((r) => setTimeout(() => r(), 1)); + } + const { trackId } = trackSampleOffset; + if (interrupt && trackId) { + this.interruptedTrackIds[trackId] = true; + } + return trackSampleOffset; + } + + /** + * Strips the current stream and returns the sample offset of the audio + * @param {boolean} [interrupt] + * @returns {{trackId: string|null, offset: number, currentTime: number}} + */ + async interrupt() { + return this.getTrackSampleOffset(true); + } +} + +globalThis.WavStreamPlayer = WavStreamPlayer; diff --git a/libs/react-client/src/wavtools/worklets/audio_processor.js b/libs/react-client/src/wavtools/worklets/audio_processor.js new file mode 100644 index 0000000000..66ce005841 --- /dev/null +++ b/libs/react-client/src/wavtools/worklets/audio_processor.js @@ -0,0 +1,214 @@ +const AudioProcessorWorklet = ` +class AudioProcessor extends AudioWorkletProcessor { + + constructor() { + super(); + this.port.onmessage = this.receive.bind(this); + this.initialize(); + } + + initialize() { + this.foundAudio = false; + this.recording = false; + this.chunks = []; + } + + /** + * Concatenates sampled chunks into channels + * Format is chunk[Left[], Right[]] + */ + readChannelData(chunks, channel = -1, maxChannels = 9) { + let channelLimit; + if (channel !== -1) { + if (chunks[0] && chunks[0].length - 1 < channel) { + throw new Error( + \`Channel \${channel} out of range: max \${chunks[0].length}\` + ); + } + channelLimit = channel + 1; + } else { + channel = 0; + channelLimit = Math.min(chunks[0] ? chunks[0].length : 1, maxChannels); + } + const channels = []; + for (let n = channel; n < channelLimit; n++) { + const length = chunks.reduce((sum, chunk) => { + return sum + chunk[n].length; + }, 0); + const buffers = chunks.map((chunk) => chunk[n]); + const result = new Float32Array(length); + let offset = 0; + for (let i = 0; i < buffers.length; i++) { + result.set(buffers[i], offset); + offset += buffers[i].length; + } + channels[n] = result; + } + return channels; + } + + /** + * Combines parallel audio data into correct format, + * channels[Left[], Right[]] to float32Array[LRLRLRLR...] 
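+ * Also returns meanValues: the per-sample mean across channels (a mono mixdown)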
+ */ + formatAudioData(channels) { + if (channels.length === 1) { + // Simple case is only one channel + const float32Array = channels[0].slice(); + const meanValues = channels[0].slice(); + return { float32Array, meanValues }; + } else { + const float32Array = new Float32Array( + channels[0].length * channels.length + ); + const meanValues = new Float32Array(channels[0].length); + for (let i = 0; i < channels[0].length; i++) { + const offset = i * channels.length; + let meanValue = 0; + for (let n = 0; n < channels.length; n++) { + float32Array[offset + n] = channels[n][i]; + meanValue += channels[n][i]; + } + meanValues[i] = meanValue / channels.length; + } + return { float32Array, meanValues }; + } + } + + /** + * Converts 32-bit float data to 16-bit integers + */ + floatTo16BitPCM(float32Array) { + const buffer = new ArrayBuffer(float32Array.length * 2); + const view = new DataView(buffer); + let offset = 0; + for (let i = 0; i < float32Array.length; i++, offset += 2) { + let s = Math.max(-1, Math.min(1, float32Array[i])); + view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); + } + return buffer; + } + + /** + * Retrieves the most recent amplitude values from the audio stream + * @param {number} channel + */ + getValues(channel = -1) { + const channels = this.readChannelData(this.chunks, channel); + const { meanValues } = this.formatAudioData(channels); + return { meanValues, channels }; + } + + /** + * Exports chunks as an audio/wav file + */ + export() { + const channels = this.readChannelData(this.chunks); + const { float32Array, meanValues } = this.formatAudioData(channels); + const audioData = this.floatTo16BitPCM(float32Array); + return { + meanValues: meanValues, + audio: { + bitsPerSample: 16, + channels: channels, + data: audioData, + }, + }; + } + + receive(e) { + const { event, id } = e.data; + let receiptData = {}; + switch (event) { + case 'start': + this.recording = true; + break; + case 'stop': + this.recording = false; + break; + case 'clear': + this.initialize(); + break; + case 'export': + receiptData = this.export(); + break; + case 'read': + receiptData = this.getValues(); + break; + default: + break; + } + // Always send back receipt + this.port.postMessage({ event: 'receipt', id, data: receiptData }); + } + + sendChunk(chunk) { + const channels = this.readChannelData([chunk]); + const { float32Array, meanValues } = this.formatAudioData(channels); + const rawAudioData = this.floatTo16BitPCM(float32Array); + const monoAudioData = this.floatTo16BitPCM(meanValues); + this.port.postMessage({ + event: 'chunk', + data: { + mono: monoAudioData, + raw: rawAudioData, + }, + }); + } + + process(inputList, outputList, parameters) { + // Copy input to output (e.g. 
speakers) + // Note that this creates choppy sounds with Mac products + const sourceLimit = Math.min(inputList.length, outputList.length); + for (let inputNum = 0; inputNum < sourceLimit; inputNum++) { + const input = inputList[inputNum]; + const output = outputList[inputNum]; + const channelCount = Math.min(input.length, output.length); + for (let channelNum = 0; channelNum < channelCount; channelNum++) { + input[channelNum].forEach((sample, i) => { + output[channelNum][i] = sample; + }); + } + } + const inputs = inputList[0]; + // There's latency at the beginning of a stream before recording starts + // Make sure we actually receive audio data before we start storing chunks + let sliceIndex = 0; + if (!this.foundAudio) { + for (const channel of inputs) { + sliceIndex = 0; // reset for each channel + if (this.foundAudio) { + break; + } + if (channel) { + for (const value of channel) { + if (value !== 0) { + // find only one non-zero entry in any channel + this.foundAudio = true; + break; + } else { + sliceIndex++; + } + } + } + } + } + if (inputs && inputs[0] && this.foundAudio && this.recording) { + // We need to copy the TypedArray, because the \`process\` + // internals will reuse the same buffer to hold each input + const chunk = inputs.map((input) => input.slice(sliceIndex)); + this.chunks.push(chunk); + this.sendChunk(chunk); + } + return true; + } +} + +registerProcessor('audio_processor', AudioProcessor); +`; + +const script = new Blob([AudioProcessorWorklet], { + type: 'application/javascript' +}); +const src = URL.createObjectURL(script); +export const AudioProcessorSrc = src; diff --git a/libs/react-client/src/wavtools/worklets/stream_processor.js b/libs/react-client/src/wavtools/worklets/stream_processor.js new file mode 100644 index 0000000000..b594bc511c --- /dev/null +++ b/libs/react-client/src/wavtools/worklets/stream_processor.js @@ -0,0 +1,96 @@ +export const StreamProcessorWorklet = ` +class StreamProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.hasStarted = false; + this.hasInterrupted = false; + this.outputBuffers = []; + this.bufferLength = 128; + this.write = { buffer: new Float32Array(this.bufferLength), trackId: null }; + this.writeOffset = 0; + this.trackSampleOffsets = {}; + this.port.onmessage = (event) => { + if (event.data) { + const payload = event.data; + if (payload.event === 'write') { + const int16Array = payload.buffer; + const float32Array = new Float32Array(int16Array.length); + for (let i = 0; i < int16Array.length; i++) { + float32Array[i] = int16Array[i] / 0x8000; // Convert Int16 to Float32 + } + this.writeData(float32Array, payload.trackId); + } else if ( + payload.event === 'offset' || + payload.event === 'interrupt' + ) { + const requestId = payload.requestId; + const trackId = this.write.trackId; + const offset = this.trackSampleOffsets[trackId] || 0; + this.port.postMessage({ + event: 'offset', + requestId, + trackId, + offset, + }); + if (payload.event === 'interrupt') { + this.hasInterrupted = true; + } + } else { + throw new Error(\`Unhandled event "\${payload.event}"\`); + } + } + }; + } + + writeData(float32Array, trackId = null) { + let { buffer } = this.write; + let offset = this.writeOffset; + for (let i = 0; i < float32Array.length; i++) { + buffer[offset++] = float32Array[i]; + if (offset >= buffer.length) { + this.outputBuffers.push(this.write); + this.write = { buffer: new Float32Array(this.bufferLength), trackId }; + buffer = this.write.buffer; + offset = 0; + } + } + this.writeOffset = offset; + 
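+ // Keep the partially filled buffer; the next 'write' resumes at writeOffset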
return true; + } + + process(inputs, outputs, parameters) { + const output = outputs[0]; + const outputChannelData = output[0]; + const outputBuffers = this.outputBuffers; + if (this.hasInterrupted) { + this.port.postMessage({ event: 'stop' }); + return false; + } else if (outputBuffers.length) { + this.hasStarted = true; + const { buffer, trackId } = outputBuffers.shift(); + for (let i = 0; i < outputChannelData.length; i++) { + outputChannelData[i] = buffer[i] || 0; + } + if (trackId) { + this.trackSampleOffsets[trackId] = + this.trackSampleOffsets[trackId] || 0; + this.trackSampleOffsets[trackId] += buffer.length; + } + return true; + } else if (this.hasStarted) { + this.port.postMessage({ event: 'stop' }); + return false; + } else { + return true; + } + } +} + +registerProcessor('stream_processor', StreamProcessor); +`; + +const script = new Blob([StreamProcessorWorklet], { + type: 'application/javascript' +}); +const src = URL.createObjectURL(script); +export const StreamProcessorSrc = src;
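A minimal usage sketch of how the new WavRecorder and WavStreamPlayer classes fit together. This is illustrative only: it assumes the wavtools index re-exports both classes and that the WavRecorder constructor accepts a { sampleRate } option (as in the upstream wavtools library, defined in parts of this patch not shown here). The function names startRealtimeAudio, onServerAudio, stopRealtimeAudio and the sendAudioChunk transport hook are hypothetical and not part of this patch.

import { WavRecorder, WavStreamPlayer } from './wavtools';

// Capture and playback run at the same sample rate; 24 kHz matches the new config default.
const recorder = new WavRecorder({ sampleRate: 24000 });
const player = new WavStreamPlayer({ sampleRate: 24000 });

export async function startRealtimeAudio(sendAudioChunk: (pcm: ArrayBuffer) => void) {
  await player.connect();   // prepare the output worklet and analyser
  await recorder.begin();   // request the microphone (default device), graph starts paused
  // record() invokes the callback with mono/raw PCM16 buffers once the mono buffer reaches 8192 bytes
  await recorder.record(({ mono }) => sendAudioChunk(mono), 8192);
}

export function onServerAudio(chunk: ArrayBuffer, trackId = 'default') {
  player.add16BitPCM(chunk, trackId);   // queue PCM16 audio received from the server for playback
}

export async function stopRealtimeAudio() {
  const wav = await recorder.end();   // stop mic tracks and pack the captured session as WAV
  await player.interrupt();           // halt playback and mark the current track as interrupted
  return wav;
}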