Willy/realtime (Chainlit#1401)
* Bump literalai dependency.

* Update literalai imports

* Remove unused imports from SQLAlchemy tests.

* Unit tests for LiteralDataLayer.

* Consistent LiteralAI to Chainlit conversion, resolve PaginatedResponse exceptions.

- Create LiteralToChainlitConverter class for handling conversions
- Implement methods for converting steps, threads, and attachments
- Add support for different Element subclasses based on metadata
- Allow manual setting of thread_id and id for Step and Element

* Attempt to satisfy mypy (plus cleaner approach).

* feat: add realtime audio

* fix: default config

* fix: lint

---------

Co-authored-by: Mathijs de Bruin <[email protected]>
Co-authored-by: EWouters <[email protected]>
3 people authored Oct 4, 2024
1 parent 79639b6 commit 8882619
Showing 36 changed files with 2,007 additions and 400 deletions.
7 changes: 5 additions & 2 deletions backend/chainlit/__init__.py
@@ -43,7 +43,7 @@
)
from chainlit.step import Step, step
from chainlit.sync import make_async, run_sync
from chainlit.types import AudioChunk, ChatProfile, Starter
from chainlit.types import InputAudioChunk, OutputAudioChunk, ChatProfile, Starter
from chainlit.user import PersistedUser, User
from chainlit.user_session import user_session
from chainlit.utils import make_module_getattr
@@ -56,6 +56,7 @@
author_rename,
header_auth_callback,
oauth_callback,
on_audio_start,
on_audio_chunk,
on_audio_end,
on_chat_end,
@@ -117,7 +118,8 @@ def acall(self):
"user_session",
"chat_context",
"CopilotFunction",
"AudioChunk",
"InputAudioChunk",
"OutputAudioChunk",
"Action",
"User",
"PersistedUser",
@@ -176,6 +178,7 @@ def acall(self):
"set_chat_profiles",
"set_starters",
"on_chat_end",
"on_audio_start",
"on_audio_chunk",
"on_audio_end",
"author_rename",
17 changes: 13 additions & 4 deletions backend/chainlit/callbacks.py
@@ -209,13 +209,25 @@ def on_chat_end(func: Callable) -> Callable:
return func


@trace
def on_audio_start(func: Callable) -> Callable:
"""
Hook to react to the user initiating audio.
Returns:
Callable[[], Any]: The decorated hook.
"""

config.code.on_audio_start = wrap_user_function(func, with_task=False)
return func

@trace
def on_audio_chunk(func: Callable) -> Callable:
"""
Hook to react to the audio chunks being sent.
Args:
chunk (AudioChunk): The audio chunk being sent.
chunk (InputAudioChunk): The audio chunk being sent.
Returns:
Callable[[], Any]: The decorated hook.
@@ -230,9 +242,6 @@ def on_audio_end(func: Callable) -> Callable:
"""
Hook to react to the audio stream ending. This is called after the last audio chunk is sent.
Args:
elements (List[Element]): The files that were uploaded before starting the audio stream (if any).
Returns:
Callable[[], Any]: The decorated hook.
"""
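
For context, here is a minimal sketch of how an application could register the new audio hooks. The handler bodies are illustrative only (the `cl.Message` reply and the print are not part of this commit); the decorators themselves are exported from `chainlit/__init__.py` as shown above.

```python
import chainlit as cl
from chainlit.types import InputAudioChunk


@cl.on_audio_start
async def start_audio():
    # A truthy return value switches the audio connection "on" in the UI,
    # a falsy one keeps it "off" (see the audio_start handler in socket.py below).
    return True


@cl.on_audio_chunk
async def handle_audio_chunk(chunk: InputAudioChunk):
    # Each inbound chunk carries isStart, mimeType, elapsedTime and raw bytes.
    print(chunk.isStart, chunk.mimeType, chunk.elapsedTime, len(chunk.data))


@cl.on_audio_end
async def end_audio():
    # The hook no longer receives uploaded file elements; it now takes no arguments.
    await cl.Message(content="Audio stream ended.").send()
```
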
27 changes: 6 additions & 21 deletions backend/chainlit/config.py
@@ -28,9 +28,8 @@

if TYPE_CHECKING:
from chainlit.action import Action
from chainlit.element import ElementBased
from chainlit.message import Message
from chainlit.types import AudioChunk, ChatProfile, Starter, ThreadDict
from chainlit.types import InputAudioChunk, ChatProfile, Starter, ThreadDict
from chainlit.user import User
from fastapi import Request, Response

@@ -93,18 +92,8 @@
max_size_mb = 500
[features.audio]
# Threshold for audio recording
min_decibels = -45
# Delay for the user to start speaking in MS
initial_silence_timeout = 3000
# Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop.
silence_timeout = 1500
# Above this duration (MS), the recording will forcefully stop.
max_duration = 15000
# Duration of the audio chunks in MS
chunk_duration = 1000
# Sample rate of the audio
sample_rate = 44100
sample_rate = 24000
[UI]
# Name of the assistant.
@@ -237,12 +226,7 @@ class SpontaneousFileUploadFeature(DataClassJsonMixin):

@dataclass
class AudioFeature(DataClassJsonMixin):
min_decibels: int = -45
initial_silence_timeout: int = 2000
silence_timeout: int = 1500
chunk_duration: int = 1000
max_duration: int = 15000
sample_rate: int = 44100
sample_rate: int = 24000
enabled: bool = False


@@ -297,8 +281,9 @@ class CodeSettings:
on_chat_end: Optional[Callable[[], Any]] = None
on_chat_resume: Optional[Callable[["ThreadDict"], Any]] = None
on_message: Optional[Callable[["Message"], Any]] = None
on_audio_chunk: Optional[Callable[["AudioChunk"], Any]] = None
on_audio_end: Optional[Callable[[List["ElementBased"]], Any]] = None
on_audio_start: Optional[Callable[[], Any]] = None
on_audio_chunk: Optional[Callable[["InputAudioChunk"], Any]] = None
on_audio_end: Optional[Callable[[], Any]] = None

author_rename: Optional[Callable[[str], Awaitable[str]]] = None
on_settings_update: Optional[Callable[[Dict[str, Any]], Any]] = None
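
As a quick sketch, the slimmed-down audio settings can be inspected from Python like this (defaults match the dataclass above; the print is illustrative only):

```python
from chainlit.config import AudioFeature

# The audio feature now exposes only an enable flag and a sample rate; the old
# client-side silence-detection knobs (min_decibels, initial_silence_timeout,
# silence_timeout, chunk_duration, max_duration) are gone.
audio = AudioFeature()
print(audio.sample_rate, audio.enabled)  # 24000 False
```
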
25 changes: 25 additions & 0 deletions backend/chainlit/emitter.py
@@ -17,6 +17,7 @@
FileReference,
MessagePayload,
ThreadDict,
OutputAudioChunk
)
from chainlit.user import PersistedUser
from literalai.helper import utc_now
@@ -51,6 +52,18 @@ async def resume_thread(self, thread_dict: ThreadDict):
async def send_element(self, element_dict: ElementDict):
"""Stub method to send an element to the UI."""
pass

async def update_audio_connection(self, state: Literal["on", "off"]):
"""Audio connection signaling."""
pass

async def send_audio_chunk(self, chunk: OutputAudioChunk):
"""Stub method to send an audio chunk to the UI."""
pass

async def send_audio_interrupt(self):
"""Stub method to interrupt the current audio response."""
pass

async def send_step(self, step_dict: StepDict):
"""Stub method to send a message to the UI."""
@@ -157,6 +170,18 @@ def resume_thread(self, thread_dict: ThreadDict):
"""Send a thread to the UI to resume it"""
return self.emit("resume_thread", thread_dict)

async def update_audio_connection(self, state: Literal["on", "off"]):
"""Audio connection signaling."""
await self.emit("audio_connection", state)

async def send_audio_chunk(self, chunk: OutputAudioChunk):
"""Send an audio chunk to the UI."""
await self.emit("audio_chunk", chunk)

async def send_audio_interrupt(self):
"""Method to interrupt the current audio response."""
await self.emit("audio_interrupt", {})

async def send_element(self, element_dict: ElementDict):
"""Stub method to send an element to the UI."""
await self.emit("element", element_dict)
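
A hedged sketch of how the new emitter methods might be used from application code, assuming the active emitter is reachable as `cl.context.emitter`; the `track` and `mimeType` values are placeholders, not values mandated by this commit:

```python
import chainlit as cl
from chainlit.types import OutputAudioChunk


async def stream_reply_audio(pcm_bytes: bytes) -> None:
    # Push a chunk of synthesized audio to the browser over the websocket.
    chunk = OutputAudioChunk(track="response-1", mimeType="pcm16", data=pcm_bytes)
    await cl.context.emitter.send_audio_chunk(chunk)


async def interrupt_reply_audio() -> None:
    # Stop whatever audio the UI is currently playing.
    await cl.context.emitter.send_audio_interrupt()
```

The third new method, `update_audio_connection`, is driven by the `audio_start` socket handler below rather than by user code.
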
36 changes: 19 additions & 17 deletions backend/chainlit/socket.py
@@ -18,9 +18,8 @@
from chainlit.session import WebsocketSession
from chainlit.telemetry import trace_event
from chainlit.types import (
AudioChunk,
AudioChunkPayload,
AudioEndPayload,
InputAudioChunk,
InputAudioChunkPayload,
MessagePayload,
)
from chainlit.user_session import user_sessions
@@ -314,19 +313,31 @@ async def message(sid, payload: MessagePayload):
session.current_task = task


@sio.on("audio_start")
async def audio_start(sid):
"""Handle audio init."""
session = WebsocketSession.require(sid)

context = init_ws_context(session)
if config.code.on_audio_start:
connected = bool(await config.code.on_audio_start())
connection_state = "on" if connected else "off"
await context.emitter.update_audio_connection(connection_state)


@sio.on("audio_chunk")
async def audio_chunk(sid, payload: AudioChunkPayload):
async def audio_chunk(sid, payload: InputAudioChunkPayload):
"""Handle an audio chunk sent by the user."""
session = WebsocketSession.require(sid)

init_ws_context(session)

if config.code.on_audio_chunk:
asyncio.create_task(config.code.on_audio_chunk(AudioChunk(**payload)))
asyncio.create_task(config.code.on_audio_chunk(InputAudioChunk(**payload)))


@sio.on("audio_end")
async def audio_end(sid, payload: AudioEndPayload):
async def audio_end(sid):
"""Handle the end of the audio stream."""
session = WebsocketSession.require(sid)
try:
@@ -337,18 +348,9 @@ async def audio_end(sid, payload: AudioEndPayload):
session.has_first_interaction = True
asyncio.create_task(context.emitter.init_thread("audio"))

file_elements = []
if config.code.on_audio_end:
file_refs = payload.get("fileReferences")
if file_refs:
files = [
session.files[file["id"]]
for file in file_refs
if file["id"] in session.files
]
file_elements = [Element.from_dict(file) for file in files]

await config.code.on_audio_end(file_elements)
await config.code.on_audio_end()

except asyncio.CancelledError:
pass
except Exception as e:
3 changes: 2 additions & 1 deletion backend/chainlit/translations/en-US.json
@@ -124,7 +124,8 @@
},
"speechButton": {
"start": "Start recording",
"stop": "Stop recording"
"stop": "Stop recording",
"loading": "Connecting"
},
"SubmitButton": {
"sendMessage": "Send message",
12 changes: 6 additions & 6 deletions backend/chainlit/types.py
@@ -154,24 +154,24 @@ class MessagePayload(TypedDict):
fileReferences: Optional[List[FileReference]]


class AudioChunkPayload(TypedDict):
class InputAudioChunkPayload(TypedDict):
isStart: bool
mimeType: str
elapsedTime: float
data: bytes


@dataclass
class AudioChunk:
class InputAudioChunk:
isStart: bool
mimeType: str
elapsedTime: float
data: bytes


class AudioEndPayload(TypedDict):
fileReferences: Optional[List[FileReference]]

class OutputAudioChunk(TypedDict):
track: str
mimeType: str
data: bytes

@dataclass
class AskFileResponse:
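
To make the renamed chunk types concrete, a small sketch of both payload shapes (the literal values are illustrative only):

```python
from chainlit.types import InputAudioChunk, OutputAudioChunk

# Inbound: a dataclass, built by the audio_chunk socket handler from the
# websocket payload.
inbound = InputAudioChunk(
    isStart=True,
    mimeType="audio/webm",
    elapsedTime=0.0,
    data=b"\x00\x01",
)

# Outbound: a TypedDict, i.e. a plain dict at runtime, consumed by the
# emitter's send_audio_chunk method shown above.
outbound: OutputAudioChunk = {
    "track": "response-1",
    "mimeType": "pcm16",
    "data": b"\x00\x01",
}
```
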
2 changes: 1 addition & 1 deletion backend/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "chainlit"
version = "1.3.0rc0"
version = "1.3.0rc1"
keywords = [
'LLM',
'Agents',
2 changes: 1 addition & 1 deletion frontend/src/assets/microphone.tsx
@@ -15,7 +15,7 @@ const MicrophoneIcon = (props: SvgIconProps) => {
>
<path d="M12 2a3 3 0 0 0-3 3v7a3 3 0 0 0 6 0V5a3 3 0 0 0-3-3Z" />
<path d="M19 10v2a7 7 0 0 1-14 0v-2" />
<line x1="12" x2="12" y1="19" y2="22" />{' '}
<line x1="12" x2="12" y1="19" y2="22" />
</SvgIcon>
);
};
26 changes: 26 additions & 0 deletions frontend/src/assets/microphoneOff.tsx
@@ -0,0 +1,26 @@
import SvgIcon, { SvgIconProps } from '@mui/material/SvgIcon';

const MicrophoneOffIcon = (props: SvgIconProps) => {
return (
<SvgIcon
{...props}
style={{
strokeLinecap: 'round',
strokeLinejoin: 'round',
strokeWidth: 2,
fill: 'none',
stroke: 'currentColor'
}}
viewBox="0 0 24 24"
>
<line x1="2" x2="22" y1="2" y2="22" />
<path d="M18.89 13.23A7.12 7.12 0 0 0 19 12v-2" />
<path d="M5 10v2a7 7 0 0 0 12 5" />
<path d="M15 9.34V5a3 3 0 0 0-5.68-1.33" />
<path d="M9 9v3a3 3 0 0 0 5.12 2.12" />
<line x1="12" x2="12" y1="19" y2="22" />
</SvgIcon>
);
};

export default MicrophoneOffIcon;
2 changes: 2 additions & 0 deletions frontend/src/components/molecules/messages/Messages.tsx
@@ -62,6 +62,7 @@ const Messages = memo(
<>
{m.steps?.length ? (
<Messages
key={m.id}
messages={m.steps}
elements={elements}
actions={actions}
@@ -71,6 +72,7 @@
/>
) : null}
<MessageLoader
key={m.id + 'loader'}
show={showToolCoTLoader || showHiddenCoTLoader}
/>
</>