From 8882619742b8596cc016be084a51049fd1687f8f Mon Sep 17 00:00:00 2001 From: Willy Douhard Date: Fri, 4 Oct 2024 18:22:53 +0200 Subject: [PATCH] Willy/realtime (#1401) * Bump literalai depdendency. * Update literalai imports * Remove unused imports from SQLAlchemy tests. * Unit tests for LiteralDataLayer. * Consistent LiteralAI to Chainlit conversion, resolve PaginatedResponse exceptions. - Create LiteralToChainlitConverter class for handling conversions - Implement methods for converting steps, threads, and attachments - Add support for different Element subclasses based on metadata - Allow manual setting of thread_id and id for Step and Element * Attempt to satisfy mypy (plus cleaner approach). * feat: add realtime audio * fix: default config * fix: lint --------- Co-authored-by: Mathijs de Bruin Co-authored-by: EWouters <6179932+EWouters@users.noreply.github.com> --- backend/chainlit/__init__.py | 7 +- backend/chainlit/callbacks.py | 17 +- backend/chainlit/config.py | 27 +- backend/chainlit/emitter.py | 25 + backend/chainlit/socket.py | 36 +- backend/chainlit/translations/en-US.json | 3 +- backend/chainlit/types.py | 12 +- backend/pyproject.toml | 2 +- frontend/src/assets/microphone.tsx | 2 +- frontend/src/assets/microphoneOff.tsx | 26 + .../molecules/messages/Messages.tsx | 2 + .../organisms/chat/inputBox/AudioPresence.tsx | 123 ++++ .../chat/inputBox/MicButton/RecordScreen.tsx | 58 -- .../chat/inputBox/MicButton/index.tsx | 98 ++-- .../organisms/chat/inputBox/footer.tsx | 26 + .../organisms/chat/inputBox/index.tsx | 16 +- .../organisms/chat/inputBox/waterMark.tsx | 43 +- frontend/src/components/organisms/header.tsx | 18 +- frontend/src/pages/ResumeButton.tsx | 4 +- libs/copilot/src/components/Input.tsx | 4 +- libs/react-client/src/index.ts | 2 + libs/react-client/src/state.ts | 23 + libs/react-client/src/types/audio.ts | 5 + libs/react-client/src/types/config.ts | 6 +- libs/react-client/src/useAudio.ts | 218 +------ libs/react-client/src/useChatInteract.ts | 21 +- libs/react-client/src/useChatSession.ts | 48 +- .../src/wavtools/analysis/audio_analysis.js | 203 +++++++ .../src/wavtools/analysis/constants.js | 60 ++ libs/react-client/src/wavtools/index.ts | 7 + libs/react-client/src/wavtools/wav_packer.js | 113 ++++ .../react-client/src/wavtools/wav_recorder.js | 548 ++++++++++++++++++ .../react-client/src/wavtools/wav_renderer.ts | 132 +++++ .../src/wavtools/wav_stream_player.js | 162 ++++++ .../src/wavtools/worklets/audio_processor.js | 214 +++++++ .../src/wavtools/worklets/stream_processor.js | 96 +++ 36 files changed, 2007 insertions(+), 400 deletions(-) create mode 100644 frontend/src/assets/microphoneOff.tsx create mode 100644 frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx delete mode 100644 frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx create mode 100644 frontend/src/components/organisms/chat/inputBox/footer.tsx create mode 100644 libs/react-client/src/types/audio.ts create mode 100644 libs/react-client/src/wavtools/analysis/audio_analysis.js create mode 100644 libs/react-client/src/wavtools/analysis/constants.js create mode 100644 libs/react-client/src/wavtools/index.ts create mode 100644 libs/react-client/src/wavtools/wav_packer.js create mode 100644 libs/react-client/src/wavtools/wav_recorder.js create mode 100644 libs/react-client/src/wavtools/wav_renderer.ts create mode 100644 libs/react-client/src/wavtools/wav_stream_player.js create mode 100644 libs/react-client/src/wavtools/worklets/audio_processor.js create mode 100644 
libs/react-client/src/wavtools/worklets/stream_processor.js diff --git a/backend/chainlit/__init__.py b/backend/chainlit/__init__.py index 0506ef38f3..5455fac760 100644 --- a/backend/chainlit/__init__.py +++ b/backend/chainlit/__init__.py @@ -43,7 +43,7 @@ ) from chainlit.step import Step, step from chainlit.sync import make_async, run_sync -from chainlit.types import AudioChunk, ChatProfile, Starter +from chainlit.types import InputAudioChunk, OutputAudioChunk, ChatProfile, Starter from chainlit.user import PersistedUser, User from chainlit.user_session import user_session from chainlit.utils import make_module_getattr @@ -56,6 +56,7 @@ author_rename, header_auth_callback, oauth_callback, + on_audio_start, on_audio_chunk, on_audio_end, on_chat_end, @@ -117,7 +118,8 @@ def acall(self): "user_session", "chat_context", "CopilotFunction", - "AudioChunk", + "InputAudioChunk", + "OutputAudioChunk", "Action", "User", "PersistedUser", @@ -176,6 +178,7 @@ def acall(self): "set_chat_profiles", "set_starters", "on_chat_end", + "on_audio_start", "on_audio_chunk", "on_audio_end", "author_rename", diff --git a/backend/chainlit/callbacks.py b/backend/chainlit/callbacks.py index b559049d7b..02c6feb124 100644 --- a/backend/chainlit/callbacks.py +++ b/backend/chainlit/callbacks.py @@ -209,13 +209,25 @@ def on_chat_end(func: Callable) -> Callable: return func +@trace +def on_audio_start(func: Callable) -> Callable: + """ + Hook to react to the user initiating audio. + + Returns: + Callable[], Any]: The decorated hook. + """ + + config.code.on_audio_start = wrap_user_function(func, with_task=False) + return func + @trace def on_audio_chunk(func: Callable) -> Callable: """ Hook to react to the audio chunks being sent. Args: - chunk (AudioChunk): The audio chunk being sent. + chunk (InputAudioChunk): The audio chunk being sent. Returns: Callable[], Any]: The decorated hook. @@ -230,9 +242,6 @@ def on_audio_end(func: Callable) -> Callable: """ Hook to react to the audio stream ending. This is called after the last audio chunk is sent. - Args: - elements ([List[Element]): The files that were uploaded before starting the audio stream (if any). - Returns: Callable[], Any]: The decorated hook. """ diff --git a/backend/chainlit/config.py b/backend/chainlit/config.py index 5700479677..60bc764b47 100644 --- a/backend/chainlit/config.py +++ b/backend/chainlit/config.py @@ -28,9 +28,8 @@ if TYPE_CHECKING: from chainlit.action import Action - from chainlit.element import ElementBased from chainlit.message import Message - from chainlit.types import AudioChunk, ChatProfile, Starter, ThreadDict + from chainlit.types import InputAudioChunk, ChatProfile, Starter, ThreadDict from chainlit.user import User from fastapi import Request, Response @@ -93,18 +92,8 @@ max_size_mb = 500 [features.audio] - # Threshold for audio recording - min_decibels = -45 - # Delay for the user to start speaking in MS - initial_silence_timeout = 3000 - # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop. - silence_timeout = 1500 - # Above this duration (MS), the recording will forcefully stop. - max_duration = 15000 - # Duration of the audio chunks in MS - chunk_duration = 1000 # Sample rate of the audio - sample_rate = 44100 + sample_rate = 24000 [UI] # Name of the assistant. 
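Editor's note: the hunks above replace the old client-side silence-detection settings with a single sample_rate (default 24000, PCM16) and add the on_audio_start / on_audio_chunk / on_audio_end hooks plus the InputAudioChunk / OutputAudioChunk types. A minimal usage sketch of how these pieces are meant to fit together, assuming the realtime processing itself (forwarding chunks to a model, producing reply audio) is supplied by the application and is not part of this patch:

import chainlit as cl

@cl.on_audio_start
async def on_audio_start():
    # Truthiness of the return value decides whether the client connection
    # is switched "on" or kept "off" (see the audio_start socket handler below).
    return True

@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.InputAudioChunk):
    # chunk.data is raw PCM16 bytes captured at features.audio.sample_rate;
    # chunk.isStart marks the first chunk of a stream. Forwarding the bytes
    # to a realtime backend is application-specific (not defined in this patch).
    ...

@cl.on_audio_end
async def on_audio_end():
    # Called once after the client closes the audio stream; release any
    # realtime session held by the application here.
    ...

# Reply audio can be streamed back through the emitter methods added in this
# patch, along the lines of (track name and payload are illustrative):
#   await cl.context.emitter.send_audio_chunk(
#       cl.OutputAudioChunk(track="response", mimeType="pcm16", data=pcm_bytes)
#   )
#   await cl.context.emitter.send_audio_interrupt()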
@@ -237,12 +226,7 @@ class SpontaneousFileUploadFeature(DataClassJsonMixin): @dataclass class AudioFeature(DataClassJsonMixin): - min_decibels: int = -45 - initial_silence_timeout: int = 2000 - silence_timeout: int = 1500 - chunk_duration: int = 1000 - max_duration: int = 15000 - sample_rate: int = 44100 + sample_rate: int = 24000 enabled: bool = False @@ -297,8 +281,9 @@ class CodeSettings: on_chat_end: Optional[Callable[[], Any]] = None on_chat_resume: Optional[Callable[["ThreadDict"], Any]] = None on_message: Optional[Callable[["Message"], Any]] = None - on_audio_chunk: Optional[Callable[["AudioChunk"], Any]] = None - on_audio_end: Optional[Callable[[List["ElementBased"]], Any]] = None + on_audio_start: Optional[Callable[[], Any]] = None + on_audio_chunk: Optional[Callable[["InputAudioChunk"], Any]] = None + on_audio_end: Optional[Callable[[], Any]] = None author_rename: Optional[Callable[[str], Awaitable[str]]] = None on_settings_update: Optional[Callable[[Dict[str, Any]], Any]] = None diff --git a/backend/chainlit/emitter.py b/backend/chainlit/emitter.py index e80bd46dd2..df8a78e9f4 100644 --- a/backend/chainlit/emitter.py +++ b/backend/chainlit/emitter.py @@ -17,6 +17,7 @@ FileReference, MessagePayload, ThreadDict, + OutputAudioChunk ) from chainlit.user import PersistedUser from literalai.helper import utc_now @@ -51,6 +52,18 @@ async def resume_thread(self, thread_dict: ThreadDict): async def send_element(self, element_dict: ElementDict): """Stub method to send an element to the UI.""" pass + + async def update_audio_connection(self, state: Literal["on", "off"]): + """Audio connection signaling.""" + pass + + async def send_audio_chunk(self, chunk: OutputAudioChunk): + """Stub method to send an audio chunk to the UI.""" + pass + + async def send_audio_interrupt(self): + """Stub method to interrupt the current audio response.""" + pass async def send_step(self, step_dict: StepDict): """Stub method to send a message to the UI.""" @@ -157,6 +170,18 @@ def resume_thread(self, thread_dict: ThreadDict): """Send a thread to the UI to resume it""" return self.emit("resume_thread", thread_dict) + async def update_audio_connection(self, state: Literal["on", "off"]): + """Audio connection signaling.""" + await self.emit("audio_connection", state) + + async def send_audio_chunk(self, chunk: OutputAudioChunk): + """Send an audio chunk to the UI.""" + await self.emit("audio_chunk", chunk) + + async def send_audio_interrupt(self): + """Method to interrupt the current audio response.""" + await self.emit("audio_interrupt", {}) + async def send_element(self, element_dict: ElementDict): """Stub method to send an element to the UI.""" await self.emit("element", element_dict) diff --git a/backend/chainlit/socket.py b/backend/chainlit/socket.py index 952fd38ae9..4cfc42fa9a 100644 --- a/backend/chainlit/socket.py +++ b/backend/chainlit/socket.py @@ -18,9 +18,8 @@ from chainlit.session import WebsocketSession from chainlit.telemetry import trace_event from chainlit.types import ( - AudioChunk, - AudioChunkPayload, - AudioEndPayload, + InputAudioChunk, + InputAudioChunkPayload, MessagePayload, ) from chainlit.user_session import user_sessions @@ -314,19 +313,31 @@ async def message(sid, payload: MessagePayload): session.current_task = task +@sio.on("audio_start") +async def audio_start(sid): + """Handle audio init.""" + session = WebsocketSession.require(sid) + + context = init_ws_context(session) + if config.code.on_audio_start: + connected = bool(await config.code.on_audio_start()) + connection_state = 
"on" if connected else "off" + await context.emitter.update_audio_connection(connection_state) + + @sio.on("audio_chunk") -async def audio_chunk(sid, payload: AudioChunkPayload): +async def audio_chunk(sid, payload: InputAudioChunkPayload): """Handle an audio chunk sent by the user.""" session = WebsocketSession.require(sid) init_ws_context(session) if config.code.on_audio_chunk: - asyncio.create_task(config.code.on_audio_chunk(AudioChunk(**payload))) + asyncio.create_task(config.code.on_audio_chunk(InputAudioChunk(**payload))) @sio.on("audio_end") -async def audio_end(sid, payload: AudioEndPayload): +async def audio_end(sid): """Handle the end of the audio stream.""" session = WebsocketSession.require(sid) try: @@ -337,18 +348,9 @@ async def audio_end(sid, payload: AudioEndPayload): session.has_first_interaction = True asyncio.create_task(context.emitter.init_thread("audio")) - file_elements = [] if config.code.on_audio_end: - file_refs = payload.get("fileReferences") - if file_refs: - files = [ - session.files[file["id"]] - for file in file_refs - if file["id"] in session.files - ] - file_elements = [Element.from_dict(file) for file in files] - - await config.code.on_audio_end(file_elements) + await config.code.on_audio_end() + except asyncio.CancelledError: pass except Exception as e: diff --git a/backend/chainlit/translations/en-US.json b/backend/chainlit/translations/en-US.json index 2bb9b6f4d8..f8a7fd096e 100644 --- a/backend/chainlit/translations/en-US.json +++ b/backend/chainlit/translations/en-US.json @@ -124,7 +124,8 @@ }, "speechButton": { "start": "Start recording", - "stop": "Stop recording" + "stop": "Stop recording", + "loading": "Connecting" }, "SubmitButton": { "sendMessage": "Send message", diff --git a/backend/chainlit/types.py b/backend/chainlit/types.py index 1ca9d18bb6..b094716f93 100644 --- a/backend/chainlit/types.py +++ b/backend/chainlit/types.py @@ -154,7 +154,7 @@ class MessagePayload(TypedDict): fileReferences: Optional[List[FileReference]] -class AudioChunkPayload(TypedDict): +class InputAudioChunkPayload(TypedDict): isStart: bool mimeType: str elapsedTime: float @@ -162,16 +162,16 @@ class AudioChunkPayload(TypedDict): @dataclass -class AudioChunk: +class InputAudioChunk: isStart: bool mimeType: str elapsedTime: float data: bytes - -class AudioEndPayload(TypedDict): - fileReferences: Optional[List[FileReference]] - +class OutputAudioChunk(TypedDict): + track: str + mimeType: str + data: bytes @dataclass class AskFileResponse: diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 503bd6ed43..c8750e9956 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "chainlit" -version = "1.3.0rc0" +version = "1.3.0rc1" keywords = [ 'LLM', 'Agents', diff --git a/frontend/src/assets/microphone.tsx b/frontend/src/assets/microphone.tsx index 226b649e07..04df14e711 100644 --- a/frontend/src/assets/microphone.tsx +++ b/frontend/src/assets/microphone.tsx @@ -15,7 +15,7 @@ const MicrophoneIcon = (props: SvgIconProps) => { > - {' '} + ); }; diff --git a/frontend/src/assets/microphoneOff.tsx b/frontend/src/assets/microphoneOff.tsx new file mode 100644 index 0000000000..10141bab3f --- /dev/null +++ b/frontend/src/assets/microphoneOff.tsx @@ -0,0 +1,26 @@ +import SvgIcon, { SvgIconProps } from '@mui/material/SvgIcon'; + +const MicrophoneOffIcon = (props: SvgIconProps) => { + return ( + + + + + + + + + ); +}; + +export default MicrophoneOffIcon; diff --git a/frontend/src/components/molecules/messages/Messages.tsx 
b/frontend/src/components/molecules/messages/Messages.tsx index 024874299d..2c3063d443 100644 --- a/frontend/src/components/molecules/messages/Messages.tsx +++ b/frontend/src/components/molecules/messages/Messages.tsx @@ -62,6 +62,7 @@ const Messages = memo( <> {m.steps?.length ? ( ) : null} diff --git a/frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx b/frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx new file mode 100644 index 0000000000..44ad6ae5f9 --- /dev/null +++ b/frontend/src/components/organisms/chat/inputBox/AudioPresence.tsx @@ -0,0 +1,123 @@ +import { useEffect, useRef } from 'react'; + +import { Box, Typography } from '@mui/material'; +import { useTheme } from '@mui/material/styles'; + +import { WavRenderer, useAudio } from '@chainlit/react-client'; + +interface Props { + type: 'client' | 'server'; + height: number; + width: number; + barCount: number; + barSpacing: number; +} + +export default function AudioPresence({ + type, + height, + width, + barCount, + barSpacing +}: Props) { + const theme = useTheme(); + const { wavRecorder, wavStreamPlayer, isAiSpeaking } = useAudio(); + const canvasRef = useRef(null); + + width = type === 'server' && !isAiSpeaking ? height : width; + + useEffect(() => { + let isLoaded = true; + const dpr = window.devicePixelRatio || 1; + let bounceDirection = 1; + let bounceFactor = 0; + + const getData = () => { + if (type === 'server' && isAiSpeaking) { + return wavStreamPlayer.analyser + ? wavStreamPlayer.getFrequencies('voice') + : { values: new Float32Array([0]) }; + } else { + return wavRecorder.recording + ? wavRecorder.getFrequencies('voice') + : { values: new Float32Array([0]) }; + } + }; + + const render = () => { + if (!isLoaded) return; + const canvas = canvasRef.current; + let ctx: CanvasRenderingContext2D | null = null; + + if (canvas) { + // Set the canvas size based on the DPR + canvas.width = width * dpr; + canvas.height = height * dpr; + canvas.style.width = `${width}px`; + canvas.style.height = `${height}px`; + + ctx = ctx || canvas.getContext('2d'); + if (ctx) { + // Scale the context to account for the DPR + ctx.scale(dpr, dpr); + + ctx.clearRect(0, 0, width, height); // Use CSS dimensions here + const result = getData(); + + if (type === 'server' && !isAiSpeaking) { + // Draw a bouncing circle + const amplitude = Math.min( + Math.max(0.6, Math.max(...result.values)), + 1 + ); // Ensure a minimum amplitude + const maxRadius = width / 2; + const baseRadius = maxRadius * amplitude; + const radius = baseRadius * (0.6 + 0.2 * bounceFactor); + const centerX = width / 2; + const centerY = height / 2; + + ctx.fillStyle = theme.palette.text.primary; + ctx.beginPath(); + ctx.arc(centerX, centerY, radius, 0, Math.PI * 2); + ctx.fill(); + + const newFactor = bounceFactor + 0.01 * bounceDirection; + if (newFactor > 1 || newFactor < 0) { + bounceDirection *= -1; + } + bounceFactor = Math.max(0, Math.min(newFactor, 1)); + } else { + WavRenderer.drawBars( + ctx, + result.values, + width, + height, + theme.palette.text.primary, + barCount, + 0, + barSpacing, + true + ); + } + } + } + window.requestAnimationFrame(render); + }; + render(); + + return () => { + isLoaded = false; + }; + }, [height, width, barCount, barSpacing, theme, wavRecorder, isAiSpeaking]); + + return ( + + {type === 'server' && !isAiSpeaking ? 
( + + Listening + + ) : null} + + + ); +} diff --git a/frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx b/frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx deleted file mode 100644 index fb25d2a030..0000000000 --- a/frontend/src/components/organisms/chat/inputBox/MicButton/RecordScreen.tsx +++ /dev/null @@ -1,58 +0,0 @@ -import { grey } from 'theme/palette'; - -import { Box } from '@mui/material'; -import Backdrop from '@mui/material/Backdrop'; - -import MicrophoneIcon from 'assets/microphone'; - -interface Props { - open?: boolean; - isSpeaking?: boolean; -} - -export default function RecordScreen({ open, isSpeaking }: Props) { - return ( - theme.zIndex.drawer + 1 }} - open={!!open} - > - - - - - - - - - - - ); -} diff --git a/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx b/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx index 8be273e7dc..6567b95e47 100644 --- a/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx +++ b/frontend/src/components/organisms/chat/inputBox/MicButton/index.tsx @@ -1,85 +1,87 @@ -import { useCallback, useEffect, useMemo } from 'react'; import { useHotkeys } from 'react-hotkeys-hook'; -import { useRecoilState, useRecoilValue } from 'recoil'; -import { toast } from 'sonner'; -import { IconButton, Theme, Tooltip, useMediaQuery } from '@mui/material'; +import { + CircularProgress, + IconButton, + Theme, + Tooltip, + useMediaQuery +} from '@mui/material'; -import { askUserState, useAudio, useConfig } from '@chainlit/react-client'; +import { useAudio, useConfig } from '@chainlit/react-client'; import { Translator } from 'components/i18n'; import MicrophoneIcon from 'assets/microphone'; - -import { attachmentsState } from 'state/chat'; - -import RecordScreen from './RecordScreen'; +import MicrophoneOffIcon from 'assets/microphoneOff'; interface Props { disabled?: boolean; } const MicButton = ({ disabled }: Props) => { - const askUser = useRecoilValue(askUserState); const { config } = useConfig(); - const { - startRecording: _startRecording, - isRecording, - isSpeaking, - isRecordingFinished, - error - } = useAudio(config?.features.audio); - const [attachments, setAttachments] = useRecoilState(attachmentsState); - - disabled = disabled || !!askUser; - - useEffect(() => { - if (isRecordingFinished) setAttachments([]); - }, [isRecordingFinished]); - - useEffect(() => { - if (!error) return; - toast.error(error); - }, [error]); - - const fileReferences = useMemo(() => { - return attachments - ?.filter((a) => !!a.serverId) - .map((a) => ({ id: a.serverId! })); - }, [attachments]); - - const startRecording = useCallback(() => { - if (disabled) return; - _startRecording(fileReferences); - }, [_startRecording, fileReferences, disabled]); - - useHotkeys('p', startRecording); + const { startConversation, endConversation, audioConnection } = useAudio(); + const isEnabled = !!config?.features.audio.enabled; + + useHotkeys( + 'p', + () => { + if (!isEnabled) return; + if (audioConnection === 'on') return endConversation(); + return startConversation(); + }, + [isEnabled, audioConnection, startConversation, endConversation] + ); const size = useMediaQuery((theme) => theme.breakpoints.down('sm')) ? 'small' : 'medium'; - if (!config?.features.audio.enabled) return null; + if (!isEnabled) return null; return ( <> - } > - + {audioConnection === 'on' ? ( + + ) : null} + {audioConnection === 'off' ? ( + + ) : null} + {audioConnection === 'connecting' ? 
( + + ) : null} diff --git a/frontend/src/components/organisms/chat/inputBox/footer.tsx b/frontend/src/components/organisms/chat/inputBox/footer.tsx new file mode 100644 index 0000000000..782af94e6d --- /dev/null +++ b/frontend/src/components/organisms/chat/inputBox/footer.tsx @@ -0,0 +1,26 @@ +import { Stack } from '@mui/material'; + +import { useAudio } from '@chainlit/react-client'; + +import AudioPresence from 'components/organisms/chat/inputBox/AudioPresence'; +import WaterMark from 'components/organisms/chat/inputBox/waterMark'; + +export default function InputBoxFooter() { + const { audioConnection } = useAudio(); + + return ( + + {audioConnection === 'on' ? ( + + ) : ( + + )} + + ); +} diff --git a/frontend/src/components/organisms/chat/inputBox/index.tsx b/frontend/src/components/organisms/chat/inputBox/index.tsx index 66ae63a405..acb4274ed0 100644 --- a/frontend/src/components/organisms/chat/inputBox/index.tsx +++ b/frontend/src/components/organisms/chat/inputBox/index.tsx @@ -14,8 +14,8 @@ import { useLayoutMaxWidth } from 'hooks/useLayoutMaxWidth'; import { IAttachment } from 'state/chat'; import { inputHistoryState } from 'state/userInputHistory'; +import InputBoxFooter from './footer'; import Input from './input'; -import WaterMark from './waterMark'; interface Props { fileSpec: FileSpec; @@ -38,7 +38,6 @@ const InputBox = memo( const { user } = useAuth(); const { sendMessage, replyMessage } = useChatInteract(); - // const tokenCount = useRecoilValue(tokenCountState); const onSubmit = useCallback( async (msg: string, attachments?: IAttachment[]) => { @@ -122,19 +121,8 @@ const InputBox = memo( onSubmit={onSubmit} onReply={onReply} /> - {/* {tokenCount > 0 && ( */} - {/* - - Token usage: {tokenCount} - - */} - {/* )} */} - + ); } diff --git a/frontend/src/components/organisms/chat/inputBox/waterMark.tsx b/frontend/src/components/organisms/chat/inputBox/waterMark.tsx index d1436e3697..9db6d0a908 100644 --- a/frontend/src/components/organisms/chat/inputBox/waterMark.tsx +++ b/frontend/src/components/organisms/chat/inputBox/waterMark.tsx @@ -1,6 +1,6 @@ import { useRecoilValue } from 'recoil'; -import { Stack, Typography } from '@mui/material'; +import { Typography } from '@mui/material'; import { Translator } from 'components/i18n'; @@ -14,29 +14,28 @@ import { settingsState } from 'state/settings'; export default function WaterMark() { const { theme } = useRecoilValue(settingsState); const Logo = theme === 'light' ? 
LogoLight : LogoDark; + return ( - - + + + + - - - - - - + /> + ); } diff --git a/frontend/src/components/organisms/header.tsx b/frontend/src/components/organisms/header.tsx index d75a4d7bc5..ce5a2242a7 100644 --- a/frontend/src/components/organisms/header.tsx +++ b/frontend/src/components/organisms/header.tsx @@ -4,6 +4,8 @@ import { useRecoilValue } from 'recoil'; import { Box, Stack } from '@mui/material'; import useMediaQuery from '@mui/material/useMediaQuery'; +import { useAudio } from '@chainlit/react-client'; + import UserButton from 'components/atoms/buttons/userButton'; import { Logo } from 'components/atoms/logo'; import ChatProfiles from 'components/molecules/chatProfiles'; @@ -11,10 +13,12 @@ import NewChatButton from 'components/molecules/newChatButton'; import { settingsState } from 'state/settings'; +import AudioPresence from './chat/inputBox/AudioPresence'; import { OpenSideBarMobileButton } from './sidebar/OpenSideBarMobileButton'; const Header = memo(() => { const isMobile = useMediaQuery('(max-width: 66rem)'); + const { audioConnection } = useAudio(); const { isChatHistoryOpen } = useRecoilValue(settingsState); return ( @@ -36,9 +40,21 @@ const Header = memo(() => { position: 'absolute', top: '50%', left: '50%', - transform: 'translate(-50%, -50%)' + transform: 'translate(-50%, -50%)', + display: 'flex', + alignItems: 'center', + gap: 1 }} > + {audioConnection === 'on' ? ( + + ) : null} {isMobile ? ( diff --git a/frontend/src/pages/ResumeButton.tsx b/frontend/src/pages/ResumeButton.tsx index 4d3931d6c2..c5c5418514 100644 --- a/frontend/src/pages/ResumeButton.tsx +++ b/frontend/src/pages/ResumeButton.tsx @@ -11,7 +11,7 @@ import { } from '@chainlit/react-client'; import { Translator } from 'components/i18n'; -import WaterMark from 'components/organisms/chat/inputBox/waterMark'; +import InputBoxFooter from 'components/organisms/chat/inputBox/footer'; import { useLayoutMaxWidth } from 'hooks/useLayoutMaxWidth'; @@ -66,7 +66,7 @@ export default function ResumeButton({ threadId }: Props) { - + ); } diff --git a/libs/copilot/src/components/Input.tsx b/libs/copilot/src/components/Input.tsx index bcb5a5a070..96b736f09b 100644 --- a/libs/copilot/src/components/Input.tsx +++ b/libs/copilot/src/components/Input.tsx @@ -10,7 +10,7 @@ import { Attachments } from '@chainlit/app/src/components/molecules/attachments' import MicButton from '@chainlit/app/src/components/organisms/chat/inputBox/MicButton'; import { SubmitButton } from '@chainlit/app/src/components/organisms/chat/inputBox/SubmitButton'; import UploadButton from '@chainlit/app/src/components/organisms/chat/inputBox/UploadButton'; -import WaterMark from '@chainlit/app/src/components/organisms/chat/inputBox/waterMark'; +import InputBoxFooter from '@chainlit/app/src/components/organisms/chat/inputBox/footer'; import { IAttachment, attachmentsState } from '@chainlit/app/src/state/chat'; import { chatSettingsOpenState } from '@chainlit/app/src/state/project'; import { inputHistoryState } from '@chainlit/app/src/state/userInputHistory'; @@ -212,7 +212,7 @@ const Input = memo( - + diff --git a/libs/react-client/src/index.ts b/libs/react-client/src/index.ts index 42599509a4..083d74abe4 100644 --- a/libs/react-client/src/index.ts +++ b/libs/react-client/src/index.ts @@ -11,3 +11,5 @@ export * from './state'; export * from './utils/message'; export { Socket } from 'socket.io-client'; + +export { WavRenderer } from './wavtools/wav_renderer'; diff --git a/libs/react-client/src/state.ts b/libs/react-client/src/state.ts index 
54ca14973c..e514daee27 100644 --- a/libs/react-client/src/state.ts +++ b/libs/react-client/src/state.ts @@ -16,6 +16,7 @@ import { ThreadHistory } from './types'; import { groupByDate } from './utils/group'; +import { WavRecorder, WavStreamPlayer } from './wavtools'; export interface ISession { socket: Socket; @@ -76,6 +77,28 @@ export const askUserState = atom({ default: undefined }); +export const wavRecorderState = atom({ + key: 'WavRecorder', + dangerouslyAllowMutability: true, + default: new WavRecorder() +}); + +export const wavStreamPlayerState = atom({ + key: 'WavStreamPlayer', + dangerouslyAllowMutability: true, + default: new WavStreamPlayer() +}); + +export const audioConnectionState = atom<'connecting' | 'on' | 'off'>({ + key: 'AudioConnection', + default: 'off' +}); + +export const isAiSpeakingState = atom({ + key: 'isAiSpeaking', + default: false +}); + export const callFnState = atom({ key: 'CallFn', default: undefined diff --git a/libs/react-client/src/types/audio.ts b/libs/react-client/src/types/audio.ts new file mode 100644 index 0000000000..e53518084e --- /dev/null +++ b/libs/react-client/src/types/audio.ts @@ -0,0 +1,5 @@ +export interface OutputAudioChunk { + track: string; + mimeType: string; + data: Int16Array; +} diff --git a/libs/react-client/src/types/config.ts b/libs/react-client/src/types/config.ts index 3ae19354d9..0184e35c4e 100644 --- a/libs/react-client/src/types/config.ts +++ b/libs/react-client/src/types/config.ts @@ -14,11 +14,7 @@ export interface ChatProfile { export interface IAudioConfig { enabled: boolean; - min_decibels: number; - initial_silence_timeout: number; - silence_timeout: number; - chunk_duration: number; - max_duration: number; + sample_rate: number; } export interface IAuthConfig { diff --git a/libs/react-client/src/useAudio.ts b/libs/react-client/src/useAudio.ts index c777a09ec6..79a964f3f2 100644 --- a/libs/react-client/src/useAudio.ts +++ b/libs/react-client/src/useAudio.ts @@ -1,196 +1,42 @@ -import { useCallback, useRef, useState } from 'react'; - -import { IAudioConfig, IFileRef } from './types'; +import { useCallback } from 'react'; +import { useRecoilState, useRecoilValue } from 'recoil'; + +import { + audioConnectionState, + isAiSpeakingState, + wavRecorderState, + wavStreamPlayerState +} from './state'; import { useChatInteract } from './useChatInteract'; -const defaultConfig: IAudioConfig = { - enabled: true, - min_decibels: -45, - initial_silence_timeout: 3000, - silence_timeout: 1500, - max_duration: 15000, - chunk_duration: 1000 -}; - -const useAudio = (config = defaultConfig) => { - const mediaRecorderRef = useRef(null); - const cancelling = useRef(false); - const { sendAudioChunk, endAudioStream } = useChatInteract(); - const [isRecording, setIsRecording] = useState(false); - const [timer, setTimer] = useState(undefined); - const [isSpeaking, setIsSpeaking] = useState(false); - const [error, setError] = useState(undefined); - const [isRecordingFinished, setIsRecordingFinished] = useState(false); - - const cancelRecording = useCallback(() => { - if (!isRecording || !mediaRecorderRef.current) { - return; - } - cancelling.current = true; - mediaRecorderRef.current.stop(); - }, [isRecording]); - - const stopRecording = useCallback(() => { - if (!isRecording || !mediaRecorderRef.current) { - return; - } - mediaRecorderRef.current.stop(); - }, [isRecording]); - - const startRecording = useCallback( - (fileReferences?: IFileRef[]) => { - if (isRecording || !config) { - return; - } - setIsRecordingFinished(false); - 
setError(undefined); - clearTimeout(timer); - cancelling.current = false; - - const { - min_decibels, - silence_timeout, - initial_silence_timeout, - chunk_duration, - max_duration - } = config; - - navigator.mediaDevices - .getUserMedia({ audio: true }) - .then((stream) => { - let spokeAtLeastOnce = false; - let isSpeaking = false; - let isFirstChunk = true; - let audioBuffer: Blob | null = null; - let startTime = Date.now(); - - const mediaRecorder = new MediaRecorder(stream); - mediaRecorderRef.current = mediaRecorder; - mediaRecorder.addEventListener('start', () => { - setIsRecording(true); - startTime = Date.now(); - }); - - mediaRecorder.addEventListener('dataavailable', async (event) => { - if (!spokeAtLeastOnce) { - if (!audioBuffer) { - audioBuffer = new Blob([event.data], { type: event.data.type }); - } else { - audioBuffer = new Blob([audioBuffer, event.data], { - type: event.data.type - }); - } - } - if (mediaRecorder.state === 'inactive') { - return; - } - const elapsedTime = Date.now() - startTime; - if (elapsedTime >= max_duration) { - mediaRecorder.stop(); - stream.getTracks().forEach((track) => track.stop()); - return; - } - - setIsSpeaking(isSpeaking); - const [mimeType, _] = mediaRecorder.mimeType.split(';'); - - if (audioBuffer) { - // If there is buffered data and the user has spoken, send the buffered data first - await sendAudioChunk( - isFirstChunk, - mimeType, - elapsedTime, - new Blob([audioBuffer, event.data]) - ); - audioBuffer = null; // Clear the buffer - } else { - await sendAudioChunk( - isFirstChunk, - mimeType, - elapsedTime, - event.data - ); - } - - if (isFirstChunk) { - isFirstChunk = false; - } - }); - - mediaRecorder.addEventListener('stop', async () => { - setIsRecording(false); - setIsSpeaking(false); - - if (spokeAtLeastOnce && !cancelling.current) { - setIsRecordingFinished(true); - await endAudioStream(fileReferences); - } - }); - - const audioContext = new AudioContext(); - const audioStreamSource = - audioContext.createMediaStreamSource(stream); - const analyser = audioContext.createAnalyser(); - analyser.minDecibels = min_decibels; - audioStreamSource.connect(analyser); - - const bufferLength = analyser.frequencyBinCount; - - const domainData = new Uint8Array(bufferLength); - - mediaRecorder.start(chunk_duration); +const useAudio = () => { + const [audioConnection, setAudioConnection] = + useRecoilState(audioConnectionState); + const wavRecorder = useRecoilValue(wavRecorderState); + const wavStreamPlayer = useRecoilValue(wavStreamPlayerState); + const isAiSpeaking = useRecoilValue(isAiSpeakingState); - const detectSound = () => { - if (mediaRecorder.state === 'inactive') { - return; - } - analyser.getByteFrequencyData(domainData); - const soundDetected = domainData.some((value) => value > 0); + const { startAudioStream, endAudioStream } = useChatInteract(); - if (!isSpeaking) { - isSpeaking = soundDetected; - } - if (!spokeAtLeastOnce && soundDetected) { - setIsSpeaking(isSpeaking); - spokeAtLeastOnce = true; - } - requestAnimationFrame(detectSound); - }; - detectSound(); + const startConversation = useCallback(async () => { + setAudioConnection('connecting'); + await startAudioStream(); + }, [startAudioStream]); - setTimeout(() => { - if (!spokeAtLeastOnce) { - mediaRecorder.stop(); - stream.getTracks().forEach((track) => track.stop()); - } else { - setTimer( - setInterval(() => { - if (!isSpeaking) { - mediaRecorder.stop(); - stream.getTracks().forEach((track) => track.stop()); - } else { - isSpeaking = false; - } - }, silence_timeout) - ); 
- } - }, initial_silence_timeout); - }) - .catch((err) => { - setError(err.message); - }); - }, - [timer, isRecording, config, sendAudioChunk, endAudioStream] - ); + const endConversation = useCallback(async () => { + setAudioConnection('off'); + await wavRecorder.end(); + await wavStreamPlayer.interrupt(); + await endAudioStream(); + }, [endAudioStream, wavRecorder, wavStreamPlayer]); return { - startRecording, - stopRecording, - cancelRecording, - isRecording, - isSpeaking, - isRecordingFinished, - error + startConversation, + endConversation, + audioConnection, + isAiSpeaking, + wavRecorder, + wavStreamPlayer }; }; diff --git a/libs/react-client/src/useChatInteract.ts b/libs/react-client/src/useChatInteract.ts index 5aeb34be3c..ab646de970 100644 --- a/libs/react-client/src/useChatInteract.ts +++ b/libs/react-client/src/useChatInteract.ts @@ -90,8 +90,17 @@ const useChatInteract = () => { [session?.socket] ); + const startAudioStream = useCallback(() => { + session?.socket.emit('audio_start'); + }, [session?.socket]); + const sendAudioChunk = useCallback( - (isStart: boolean, mimeType: string, elapsedTime: number, data: Blob) => { + ( + isStart: boolean, + mimeType: string, + elapsedTime: number, + data: Int16Array + ) => { session?.socket.emit('audio_chunk', { isStart, mimeType, @@ -102,12 +111,9 @@ const useChatInteract = () => { [session?.socket] ); - const endAudioStream = useCallback( - (fileReferences?: IFileRef[]) => { - session?.socket.emit('audio_end', { fileReferences }); - }, - [session?.socket] - ); + const endAudioStream = useCallback(() => { + session?.socket.emit('audio_end'); + }, [session?.socket]); const replyMessage = useCallback( (message: IStep) => { @@ -179,6 +185,7 @@ const useChatInteract = () => { replyMessage, sendMessage, editMessage, + startAudioStream, sendAudioChunk, endAudioStream, stopTask, diff --git a/libs/react-client/src/useChatSession.ts b/libs/react-client/src/useChatSession.ts index ffa817c586..fc1de3fbd5 100644 --- a/libs/react-client/src/useChatSession.ts +++ b/libs/react-client/src/useChatSession.ts @@ -10,6 +10,7 @@ import io from 'socket.io-client'; import { actionState, askUserState, + audioConnectionState, callFnState, chatProfileState, chatSettingsInputsState, @@ -17,13 +18,16 @@ import { currentThreadIdState, elementState, firstUserInteraction, + isAiSpeakingState, loadingState, messagesState, sessionIdState, sessionState, tasklistState, threadIdToResumeState, - tokenCountState + tokenCountState, + wavRecorderState, + wavStreamPlayerState } from 'src/state'; import { IAction, @@ -40,6 +44,8 @@ import { updateMessageContentById } from 'src/utils/message'; +import { OutputAudioChunk } from './types/audio'; + import { ChainlitContext } from './context'; import type { IToken } from './useChatData'; @@ -48,10 +54,13 @@ const useChatSession = () => { const sessionId = useRecoilValue(sessionIdState); const [session, setSession] = useRecoilState(sessionState); - + const setIsAiSpeaking = useSetRecoilState(isAiSpeakingState); + const setAudioConnection = useSetRecoilState(audioConnectionState); const resetChatSettingsValue = useResetRecoilState(chatSettingsValueState); const setFirstUserInteraction = useSetRecoilState(firstUserInteraction); const setLoading = useSetRecoilState(loadingState); + const wavStreamPlayer = useRecoilValue(wavStreamPlayerState); + const wavRecorder = useRecoilValue(wavRecorderState); const setMessages = useSetRecoilState(messagesState); const setAskUser = useSetRecoilState(askUserState); const setCallFn = 
useSetRecoilState(callFnState); @@ -132,6 +141,41 @@ const useChatSession = () => { window.location.reload(); }); + socket.on('audio_connection', async (state: 'on' | 'off') => { + if (state === 'on') { + let isFirstChunk = true; + const startTime = Date.now(); + const mimeType = 'pcm16'; + // Connect to microphone + await wavRecorder.begin(); + await wavStreamPlayer.connect(); + await wavRecorder.record(async (data) => { + const elapsedTime = Date.now() - startTime; + socket.emit('audio_chunk', { + isStart: isFirstChunk, + mimeType, + elapsedTime, + data: data.mono + }); + isFirstChunk = false; + }); + wavStreamPlayer.onStop = () => setIsAiSpeaking(false); + } else { + await wavRecorder.end(); + await wavStreamPlayer.interrupt(); + } + setAudioConnection(state); + }); + + socket.on('audio_chunk', (chunk: OutputAudioChunk) => { + wavStreamPlayer.add16BitPCM(chunk.data, chunk.track); + setIsAiSpeaking(true); + }); + + socket.on('audio_interrupt', () => { + wavStreamPlayer.interrupt(); + }); + socket.on('resume_thread', (thread: IThread) => { let messages: IStep[] = []; for (const step of thread.steps) { diff --git a/libs/react-client/src/wavtools/analysis/audio_analysis.js b/libs/react-client/src/wavtools/analysis/audio_analysis.js new file mode 100644 index 0000000000..eb4836904a --- /dev/null +++ b/libs/react-client/src/wavtools/analysis/audio_analysis.js @@ -0,0 +1,203 @@ +import { + noteFrequencies, + noteFrequencyLabels, + voiceFrequencies, + voiceFrequencyLabels +} from './constants.js'; + +/** + * Output of AudioAnalysis for the frequency domain of the audio + * @typedef {Object} AudioAnalysisOutputType + * @property {Float32Array} values Amplitude of this frequency between {0, 1} inclusive + * @property {number[]} frequencies Raw frequency bucket values + * @property {string[]} labels Labels for the frequency bucket values + */ + +/** + * Analyzes audio for visual output + * @class + */ +export class AudioAnalysis { + /** + * Retrieves frequency domain data from an AnalyserNode adjusted to a decibel range + * returns human-readable formatting and labels + * @param {AnalyserNode} analyser + * @param {number} sampleRate + * @param {Float32Array} [fftResult] + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {AudioAnalysisOutputType} + */ + static getFrequencies( + analyser, + sampleRate, + fftResult, + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + if (!fftResult) { + fftResult = new Float32Array(analyser.frequencyBinCount); + analyser.getFloatFrequencyData(fftResult); + } + const nyquistFrequency = sampleRate / 2; + const frequencyStep = (1 / fftResult.length) * nyquistFrequency; + let outputValues; + let frequencies; + let labels; + if (analysisType === 'music' || analysisType === 'voice') { + const useFrequencies = + analysisType === 'voice' ? voiceFrequencies : noteFrequencies; + const aggregateOutput = Array(useFrequencies.length).fill(minDecibels); + for (let i = 0; i < fftResult.length; i++) { + const frequency = i * frequencyStep; + const amplitude = fftResult[i]; + for (let n = useFrequencies.length - 1; n >= 0; n--) { + if (frequency > useFrequencies[n]) { + aggregateOutput[n] = Math.max(aggregateOutput[n], amplitude); + break; + } + } + } + outputValues = aggregateOutput; + frequencies = + analysisType === 'voice' ? voiceFrequencies : noteFrequencies; + labels = + analysisType === 'voice' ? 
voiceFrequencyLabels : noteFrequencyLabels; + } else { + outputValues = Array.from(fftResult); + frequencies = outputValues.map((_, i) => frequencyStep * i); + labels = frequencies.map((f) => `${f.toFixed(2)} Hz`); + } + // We normalize to {0, 1} + const normalizedOutput = outputValues.map((v) => { + return Math.max( + 0, + Math.min((v - minDecibels) / (maxDecibels - minDecibels), 1) + ); + }); + const values = new Float32Array(normalizedOutput); + return { + values, + frequencies, + labels + }; + } + + /** + * Creates a new AudioAnalysis instance for an HTMLAudioElement + * @param {HTMLAudioElement} audioElement + * @param {AudioBuffer|null} [audioBuffer] If provided, will cache all frequency domain data from the buffer + * @returns {AudioAnalysis} + */ + constructor(audioElement, audioBuffer = null) { + this.fftResults = []; + if (audioBuffer) { + /** + * Modified from + * https://stackoverflow.com/questions/75063715/using-the-web-audio-api-to-analyze-a-song-without-playing + * + * We do this to populate FFT values for the audio if provided an `audioBuffer` + * The reason to do this is that Safari fails when using `createMediaElementSource` + * This has a non-zero RAM cost so we only opt-in to run it on Safari, Chrome is better + */ + const { length, sampleRate } = audioBuffer; + const offlineAudioContext = new OfflineAudioContext({ + length, + sampleRate + }); + const source = offlineAudioContext.createBufferSource(); + source.buffer = audioBuffer; + const analyser = offlineAudioContext.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + source.connect(analyser); + // limit is :: 128 / sampleRate; + // but we just want 60fps - cuts ~1s from 6MB to 1MB of RAM + const renderQuantumInSeconds = 1 / 60; + const durationInSeconds = length / sampleRate; + const analyze = (index) => { + const suspendTime = renderQuantumInSeconds * index; + if (suspendTime < durationInSeconds) { + offlineAudioContext.suspend(suspendTime).then(() => { + const fftResult = new Float32Array(analyser.frequencyBinCount); + analyser.getFloatFrequencyData(fftResult); + this.fftResults.push(fftResult); + analyze(index + 1); + }); + } + if (index === 1) { + offlineAudioContext.startRendering(); + } else { + offlineAudioContext.resume(); + } + }; + source.start(0); + analyze(1); + this.audio = audioElement; + this.context = offlineAudioContext; + this.analyser = analyser; + this.sampleRate = sampleRate; + this.audioBuffer = audioBuffer; + } else { + const audioContext = new AudioContext(); + const track = audioContext.createMediaElementSource(audioElement); + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + track.connect(analyser); + analyser.connect(audioContext.destination); + this.audio = audioElement; + this.context = audioContext; + this.analyser = analyser; + this.sampleRate = this.context.sampleRate; + this.audioBuffer = null; + } + } + + /** + * Gets the current frequency domain data from the playing audio track + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {AudioAnalysisOutputType} + */ + getFrequencies( + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + let fftResult = null; + if (this.audioBuffer && this.fftResults.length) { + const pct = this.audio.currentTime / this.audio.duration; + const index = Math.min( + (pct * this.fftResults.length) | 0, + 
this.fftResults.length - 1 + ); + fftResult = this.fftResults[index]; + } + return AudioAnalysis.getFrequencies( + this.analyser, + this.sampleRate, + fftResult, + analysisType, + minDecibels, + maxDecibels + ); + } + + /** + * Resume the internal AudioContext if it was suspended due to the lack of + * user interaction when the AudioAnalysis was instantiated. + * @returns {Promise} + */ + async resumeIfSuspended() { + if (this.context.state === 'suspended') { + await this.context.resume(); + } + return true; + } +} + +globalThis.AudioAnalysis = AudioAnalysis; diff --git a/libs/react-client/src/wavtools/analysis/constants.js b/libs/react-client/src/wavtools/analysis/constants.js new file mode 100644 index 0000000000..4c57e557de --- /dev/null +++ b/libs/react-client/src/wavtools/analysis/constants.js @@ -0,0 +1,60 @@ +/** + * Constants for help with visualization + * Helps map frequency ranges from Fast Fourier Transform + * to human-interpretable ranges, notably music ranges and + * human vocal ranges. + */ + +// Eighth octave frequencies +const octave8Frequencies = [ + 4186.01, 4434.92, 4698.63, 4978.03, 5274.04, 5587.65, 5919.91, 6271.93, + 6644.88, 7040.0, 7458.62, 7902.13 +]; + +// Labels for each of the above frequencies +const octave8FrequencyLabels = [ + 'C', + 'C#', + 'D', + 'D#', + 'E', + 'F', + 'F#', + 'G', + 'G#', + 'A', + 'A#', + 'B' +]; + +/** + * All note frequencies from 1st to 8th octave + * in format "A#8" (A#, 8th octave) + */ +export const noteFrequencies = []; +export const noteFrequencyLabels = []; +for (let i = 1; i <= 8; i++) { + for (let f = 0; f < octave8Frequencies.length; f++) { + const freq = octave8Frequencies[f]; + noteFrequencies.push(freq / Math.pow(2, 8 - i)); + noteFrequencyLabels.push(octave8FrequencyLabels[f] + i); + } +} + +/** + * Subset of the note frequencies between 32 and 2000 Hz + * 6 octave range: C1 to B6 + */ +const voiceFrequencyRange = [32.0, 2000.0]; +export const voiceFrequencies = noteFrequencies.filter((_, i) => { + return ( + noteFrequencies[i] > voiceFrequencyRange[0] && + noteFrequencies[i] < voiceFrequencyRange[1] + ); +}); +export const voiceFrequencyLabels = noteFrequencyLabels.filter((_, i) => { + return ( + noteFrequencies[i] > voiceFrequencyRange[0] && + noteFrequencies[i] < voiceFrequencyRange[1] + ); +}); diff --git a/libs/react-client/src/wavtools/index.ts b/libs/react-client/src/wavtools/index.ts new file mode 100644 index 0000000000..62bd593449 --- /dev/null +++ b/libs/react-client/src/wavtools/index.ts @@ -0,0 +1,7 @@ +// Courtesy of https://github.com/openai/openai-realtime-console +import { AudioAnalysis } from './analysis/audio_analysis.js'; +import { WavPacker } from './wav_packer.js'; +import { WavRecorder } from './wav_recorder.js'; +import { WavStreamPlayer } from './wav_stream_player.js'; + +export { AudioAnalysis, WavPacker, WavStreamPlayer, WavRecorder }; diff --git a/libs/react-client/src/wavtools/wav_packer.js b/libs/react-client/src/wavtools/wav_packer.js new file mode 100644 index 0000000000..1e11780bbc --- /dev/null +++ b/libs/react-client/src/wavtools/wav_packer.js @@ -0,0 +1,113 @@ +/** + * Raw wav audio file contents + * @typedef {Object} WavPackerAudioType + * @property {Blob} blob + * @property {string} url + * @property {number} channelCount + * @property {number} sampleRate + * @property {number} duration + */ + +/** + * Utility class for assembling PCM16 "audio/wav" data + * @class + */ +export class WavPacker { + /** + * Converts Float32Array of amplitude data to ArrayBuffer in Int16Array format + * 
@param {Float32Array} float32Array + * @returns {ArrayBuffer} + */ + static floatTo16BitPCM(float32Array) { + const buffer = new ArrayBuffer(float32Array.length * 2); + const view = new DataView(buffer); + let offset = 0; + for (let i = 0; i < float32Array.length; i++, offset += 2) { + let s = Math.max(-1, Math.min(1, float32Array[i])); + view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); + } + return buffer; + } + + /** + * Concatenates two ArrayBuffers + * @param {ArrayBuffer} leftBuffer + * @param {ArrayBuffer} rightBuffer + * @returns {ArrayBuffer} + */ + static mergeBuffers(leftBuffer, rightBuffer) { + const tmpArray = new Uint8Array( + leftBuffer.byteLength + rightBuffer.byteLength + ); + tmpArray.set(new Uint8Array(leftBuffer), 0); + tmpArray.set(new Uint8Array(rightBuffer), leftBuffer.byteLength); + return tmpArray.buffer; + } + + /** + * Packs data into an Int16 format + * @private + * @param {number} size 0 = 1x Int16, 1 = 2x Int16 + * @param {number} arg value to pack + * @returns + */ + _packData(size, arg) { + return [ + new Uint8Array([arg, arg >> 8]), + new Uint8Array([arg, arg >> 8, arg >> 16, arg >> 24]) + ][size]; + } + + /** + * Packs audio into "audio/wav" Blob + * @param {number} sampleRate + * @param {{bitsPerSample: number, channels: Array, data: Int16Array}} audio + * @returns {WavPackerAudioType} + */ + pack(sampleRate, audio) { + if (!audio?.bitsPerSample) { + throw new Error(`Missing "bitsPerSample"`); + } else if (!audio?.channels) { + throw new Error(`Missing "channels"`); + } else if (!audio?.data) { + throw new Error(`Missing "data"`); + } + const { bitsPerSample, channels, data } = audio; + const output = [ + // Header + 'RIFF', + this._packData( + 1, + 4 + (8 + 24) /* chunk 1 length */ + (8 + 8) /* chunk 2 length */ + ), // Length + 'WAVE', + // chunk 1 + 'fmt ', // Sub-chunk identifier + this._packData(1, 16), // Chunk length + this._packData(0, 1), // Audio format (1 is linear quantization) + this._packData(0, channels.length), + this._packData(1, sampleRate), + this._packData(1, (sampleRate * channels.length * bitsPerSample) / 8), // Byte rate + this._packData(0, (channels.length * bitsPerSample) / 8), + this._packData(0, bitsPerSample), + // chunk 2 + 'data', // Sub-chunk identifier + this._packData( + 1, + (channels[0].length * channels.length * bitsPerSample) / 8 + ), // Chunk length + data + ]; + const blob = new Blob(output, { type: 'audio/mpeg' }); + const url = URL.createObjectURL(blob); + return { + blob, + url, + channelCount: channels.length, + sampleRate, + duration: data.byteLength / (channels.length * sampleRate * 2) + }; + } +} + +globalThis.WavPacker = WavPacker; diff --git a/libs/react-client/src/wavtools/wav_recorder.js b/libs/react-client/src/wavtools/wav_recorder.js new file mode 100644 index 0000000000..9b36a205d0 --- /dev/null +++ b/libs/react-client/src/wavtools/wav_recorder.js @@ -0,0 +1,548 @@ +import { AudioAnalysis } from './analysis/audio_analysis.js'; +import { WavPacker } from './wav_packer.js'; +import { AudioProcessorSrc } from './worklets/audio_processor.js'; + +/** + * Decodes audio into a wav file + * @typedef {Object} DecodedAudioType + * @property {Blob} blob + * @property {string} url + * @property {Float32Array} values + * @property {AudioBuffer} audioBuffer + */ + +/** + * Records live stream of user audio as PCM16 "audio/wav" data + * @class + */ +export class WavRecorder { + /** + * Create a new WavRecorder instance + * @param {{sampleRate?: number, outputToSpeakers?: boolean, debug?: boolean}} 
[options] + * @returns {WavRecorder} + */ + constructor({ + sampleRate = 24000, + outputToSpeakers = false, + debug = false + } = {}) { + // Script source + this.scriptSrc = AudioProcessorSrc; + // Config + this.sampleRate = sampleRate; + this.outputToSpeakers = outputToSpeakers; + this.debug = !!debug; + this._deviceChangeCallback = null; + this._devices = []; + // State variables + this.stream = null; + this.processor = null; + this.source = null; + this.node = null; + this.recording = false; + // Event handling with AudioWorklet + this._lastEventId = 0; + this.eventReceipts = {}; + this.eventTimeout = 5000; + // Process chunks of audio + this._chunkProcessor = () => {}; + this._chunkProcessorSize = void 0; + this._chunkProcessorBuffer = { + raw: new ArrayBuffer(0), + mono: new ArrayBuffer(0) + }; + } + + /** + * Decodes audio data from multiple formats to a Blob, url, Float32Array and AudioBuffer + * @param {Blob|Float32Array|Int16Array|ArrayBuffer|number[]} audioData + * @param {number} sampleRate + * @param {number} fromSampleRate + * @returns {Promise} + */ + static async decode(audioData, sampleRate = 24000, fromSampleRate = -1) { + const context = new AudioContext({ sampleRate }); + let arrayBuffer; + let blob; + if (audioData instanceof Blob) { + if (fromSampleRate !== -1) { + throw new Error( + `Can not specify "fromSampleRate" when reading from Blob` + ); + } + blob = audioData; + arrayBuffer = await blob.arrayBuffer(); + } else if (audioData instanceof ArrayBuffer) { + if (fromSampleRate !== -1) { + throw new Error( + `Can not specify "fromSampleRate" when reading from ArrayBuffer` + ); + } + arrayBuffer = audioData; + blob = new Blob([arrayBuffer], { type: 'audio/wav' }); + } else { + let float32Array; + let data; + if (audioData instanceof Int16Array) { + data = audioData; + float32Array = new Float32Array(audioData.length); + for (let i = 0; i < audioData.length; i++) { + float32Array[i] = audioData[i] / 0x8000; + } + } else if (audioData instanceof Float32Array) { + float32Array = audioData; + } else if (audioData instanceof Array) { + float32Array = new Float32Array(audioData); + } else { + throw new Error( + `"audioData" must be one of: Blob, Float32Arrray, Int16Array, ArrayBuffer, Array` + ); + } + if (fromSampleRate === -1) { + throw new Error( + `Must specify "fromSampleRate" when reading from Float32Array, In16Array or Array` + ); + } else if (fromSampleRate < 3000) { + throw new Error(`Minimum "fromSampleRate" is 3000 (3kHz)`); + } + if (!data) { + data = WavPacker.floatTo16BitPCM(float32Array); + } + const audio = { + bitsPerSample: 16, + channels: [float32Array], + data + }; + const packer = new WavPacker(); + const result = packer.pack(fromSampleRate, audio); + blob = result.blob; + arrayBuffer = await blob.arrayBuffer(); + } + const audioBuffer = await context.decodeAudioData(arrayBuffer); + const values = audioBuffer.getChannelData(0); + const url = URL.createObjectURL(blob); + return { + blob, + url, + values, + audioBuffer + }; + } + + /** + * Logs data in debug mode + * @param {...any} arguments + * @returns {true} + */ + log() { + if (this.debug) { + this.log(...arguments); + } + return true; + } + + /** + * Retrieves the current sampleRate for the recorder + * @returns {number} + */ + getSampleRate() { + return this.sampleRate; + } + + /** + * Retrieves the current status of the recording + * @returns {"ended"|"paused"|"recording"} + */ + getStatus() { + if (!this.processor) { + return 'ended'; + } else if (!this.recording) { + return 'paused'; + } else { + 
return 'recording'; + } + } + + /** + * Sends an event to the AudioWorklet + * @private + * @param {string} name + * @param {{[key: string]: any}} data + * @param {AudioWorkletNode} [_processor] + * @returns {Promise<{[key: string]: any}>} + */ + async _event(name, data = {}, _processor = null) { + _processor = _processor || this.processor; + if (!_processor) { + throw new Error('Can not send events without recording first'); + } + const message = { + event: name, + id: this._lastEventId++, + data + }; + _processor.port.postMessage(message); + const t0 = new Date().valueOf(); + while (!this.eventReceipts[message.id]) { + if (new Date().valueOf() - t0 > this.eventTimeout) { + throw new Error(`Timeout waiting for "${name}" event`); + } + await new Promise((res) => setTimeout(() => res(true), 1)); + } + const payload = this.eventReceipts[message.id]; + delete this.eventReceipts[message.id]; + return payload; + } + + /** + * Sets device change callback, remove if callback provided is `null` + * @param {(Array): void|null} callback + * @returns {true} + */ + listenForDeviceChange(callback) { + if (callback === null && this._deviceChangeCallback) { + navigator.mediaDevices.removeEventListener( + 'devicechange', + this._deviceChangeCallback + ); + this._deviceChangeCallback = null; + } else if (callback !== null) { + // Basically a debounce; we only want this called once when devices change + // And we only want the most recent callback() to be executed + // if a few are operating at the same time + let lastId = 0; + let lastDevices = []; + const serializeDevices = (devices) => + devices + .map((d) => d.deviceId) + .sort() + .join(','); + const cb = async () => { + let id = ++lastId; + const devices = await this.listDevices(); + if (id === lastId) { + if (serializeDevices(lastDevices) !== serializeDevices(devices)) { + lastDevices = devices; + callback(devices.slice()); + } + } + }; + navigator.mediaDevices.addEventListener('devicechange', cb); + cb(); + this._deviceChangeCallback = cb; + } + return true; + } + + /** + * Manually request permission to use the microphone + * @returns {Promise} + */ + async requestPermission() { + const permissionStatus = await navigator.permissions.query({ + name: 'microphone' + }); + if (permissionStatus.state === 'denied') { + window.alert('You must grant microphone access to use this feature.'); + } else if (permissionStatus.state === 'prompt') { + try { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: true + }); + const tracks = stream.getTracks(); + tracks.forEach((track) => track.stop()); + } catch (e) { + window.alert('You must grant microphone access to use this feature.'); + } + } + return true; + } + + /** + * List all eligible devices for recording, will request permission to use microphone + * @returns {Promise>} + */ + async listDevices() { + if ( + !navigator.mediaDevices || + !('enumerateDevices' in navigator.mediaDevices) + ) { + throw new Error('Could not request user devices'); + } + await this.requestPermission(); + const devices = await navigator.mediaDevices.enumerateDevices(); + const audioDevices = devices.filter( + (device) => device.kind === 'audioinput' + ); + const defaultDeviceIndex = audioDevices.findIndex( + (device) => device.deviceId === 'default' + ); + const deviceList = []; + if (defaultDeviceIndex !== -1) { + let defaultDevice = audioDevices.splice(defaultDeviceIndex, 1)[0]; + let existingIndex = audioDevices.findIndex( + (device) => device.groupId === defaultDevice.groupId + ); + if (existingIndex !== -1) { 
+ defaultDevice = audioDevices.splice(existingIndex, 1)[0]; + } + defaultDevice.default = true; + deviceList.push(defaultDevice); + } + return deviceList.concat(audioDevices); + } + + /** + * Begins a recording session and requests microphone permissions if not already granted + * Microphone recording indicator will appear on browser tab but status will be "paused" + * @param {string} [deviceId] if no device provided, default device will be used + * @returns {Promise} + */ + async begin(deviceId) { + if (this.processor) { + throw new Error( + `Already connected: please call .end() to start a new session` + ); + } + + if ( + !navigator.mediaDevices || + !('getUserMedia' in navigator.mediaDevices) + ) { + throw new Error('Could not request user media'); + } + try { + const config = { audio: true }; + if (deviceId) { + config.audio = { deviceId: { exact: deviceId } }; + } + this.stream = await navigator.mediaDevices.getUserMedia(config); + } catch (err) { + throw new Error('Could not start media stream'); + } + + const context = new AudioContext({ sampleRate: this.sampleRate }); + const source = context.createMediaStreamSource(this.stream); + // Load and execute the module script. + try { + await context.audioWorklet.addModule(this.scriptSrc); + } catch (e) { + console.error(e); + throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); + } + const processor = new AudioWorkletNode(context, 'audio_processor'); + processor.port.onmessage = (e) => { + const { event, id, data } = e.data; + if (event === 'receipt') { + this.eventReceipts[id] = data; + } else if (event === 'chunk') { + if (this._chunkProcessorSize) { + const buffer = this._chunkProcessorBuffer; + this._chunkProcessorBuffer = { + raw: WavPacker.mergeBuffers(buffer.raw, data.raw), + mono: WavPacker.mergeBuffers(buffer.mono, data.mono) + }; + if ( + this._chunkProcessorBuffer.mono.byteLength >= + this._chunkProcessorSize + ) { + this._chunkProcessor(this._chunkProcessorBuffer); + this._chunkProcessorBuffer = { + raw: new ArrayBuffer(0), + mono: new ArrayBuffer(0) + }; + } + } else { + this._chunkProcessor(data); + } + } + }; + + const node = source.connect(processor); + const analyser = context.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + node.connect(analyser); + if (this.outputToSpeakers) { + // eslint-disable-next-line no-console + console.warn( + 'Warning: Output to speakers may affect sound quality,\n' + + 'especially due to system audio feedback preventative measures.\n' + + 'use only for debugging' + ); + analyser.connect(context.destination); + } + + this.source = source; + this.node = node; + this.analyser = analyser; + this.processor = processor; + return true; + } + + /** + * Gets the current frequency domain data from the recording track + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} + */ + getFrequencies( + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + return AudioAnalysis.getFrequencies( + this.analyser, + this.sampleRate, + null, + analysisType, + minDecibels, + maxDecibels + ); + } + + /** + * Pauses the recording + * Keeps microphone stream open but halts storage of audio + * @returns {Promise} + */ + async pause() { + if (!this.processor) { + throw new 
Error('Session ended: please call .begin() first'); + } else if (!this.recording) { + throw new Error('Already paused: please call .record() first'); + } + if (this._chunkProcessorBuffer.raw.byteLength) { + this._chunkProcessor(this._chunkProcessorBuffer); + } + this.log('Pausing ...'); + await this._event('stop'); + this.recording = false; + return true; + } + + /** + * Start recording stream and storing to memory from the connected audio source + * @param {(data: { mono: Int16Array; raw: Int16Array }) => any} [chunkProcessor] + * @param {number} [chunkSize] chunkProcessor will not be triggered until this size threshold met in mono audio + * @returns {Promise} + */ + async record(chunkProcessor = () => {}, chunkSize = 8192) { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } else if (this.recording) { + throw new Error('Already recording: please call .pause() first'); + } else if (typeof chunkProcessor !== 'function') { + throw new Error(`chunkProcessor must be a function`); + } + this._chunkProcessor = chunkProcessor; + this._chunkProcessorSize = chunkSize; + this._chunkProcessorBuffer = { + raw: new ArrayBuffer(0), + mono: new ArrayBuffer(0) + }; + this.log('Recording ...'); + await this._event('start'); + this.recording = true; + return true; + } + + /** + * Clears the audio buffer, empties stored recording + * @returns {Promise} + */ + async clear() { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + await this._event('clear'); + return true; + } + + /** + * Reads the current audio stream data + * @returns {Promise<{meanValues: Float32Array, channels: Array}>} + */ + async read() { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + this.log('Reading ...'); + const result = await this._event('read'); + return result; + } + + /** + * Saves the current audio stream to a file + * @param {boolean} [force] Force saving while still recording + * @returns {Promise} + */ + async save(force = false) { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + if (!force && this.recording) { + throw new Error( + 'Currently recording: please call .pause() first, or call .save(true) to force' + ); + } + this.log('Exporting ...'); + const exportData = await this._event('export'); + const packer = new WavPacker(); + const result = packer.pack(this.sampleRate, exportData.audio); + return result; + } + + /** + * Ends the current recording session and saves the result + * @returns {Promise} + */ + async end() { + if (!this.processor) { + throw new Error('Session ended: please call .begin() first'); + } + + const _processor = this.processor; + + this.log('Stopping ...'); + await this._event('stop'); + this.recording = false; + const tracks = this.stream.getTracks(); + tracks.forEach((track) => track.stop()); + + this.log('Exporting ...'); + const exportData = await this._event('export', {}, _processor); + + this.processor.disconnect(); + this.source.disconnect(); + this.node.disconnect(); + this.analyser.disconnect(); + this.stream = null; + this.processor = null; + this.source = null; + this.node = null; + + const packer = new WavPacker(); + const result = packer.pack(this.sampleRate, exportData.audio); + return result; + } + + /** + * Performs a full cleanup of WavRecorder instance + * Stops actively listening via microphone and removes existing listeners + * @returns {Promise} + */ + async quit() { + this.listenForDeviceChange(null); + 
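+ // End any active session so the microphone tracks and audio worklet are released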
if (this.processor) { + await this.end(); + } + return true; + } +} + +globalThis.WavRecorder = WavRecorder; diff --git a/libs/react-client/src/wavtools/wav_renderer.ts b/libs/react-client/src/wavtools/wav_renderer.ts new file mode 100644 index 0000000000..5e556db325 --- /dev/null +++ b/libs/react-client/src/wavtools/wav_renderer.ts @@ -0,0 +1,132 @@ +const dataMap = new WeakMap(); + +/** + * Normalizes a Float32Array to Array(m): We use this to draw amplitudes on a graph + * If we're rendering the same audio data, then we'll often be using + * the same (data, m, downsamplePeaks) triplets so we give option to memoize + */ +const normalizeArray = ( + data: Float32Array, + m: number, + downsamplePeaks: boolean = false, + memoize: boolean = false +) => { + let cache, mKey, dKey; + if (memoize) { + mKey = m.toString(); + dKey = downsamplePeaks.toString(); + cache = dataMap.has(data) ? dataMap.get(data) : {}; + dataMap.set(data, cache); + cache[mKey] = cache[mKey] || {}; + if (cache[mKey][dKey]) { + return cache[mKey][dKey]; + } + } + const n = data.length; + const result = new Array(m); + if (m <= n) { + // Downsampling + result.fill(0); + const count = new Array(m).fill(0); + for (let i = 0; i < n; i++) { + const index = Math.floor(i * (m / n)); + if (downsamplePeaks) { + // take highest result in the set + result[index] = Math.max(result[index], Math.abs(data[i])); + } else { + result[index] += Math.abs(data[i]); + } + count[index]++; + } + if (!downsamplePeaks) { + for (let i = 0; i < result.length; i++) { + result[i] = result[i] / count[i]; + } + } + } else { + for (let i = 0; i < m; i++) { + const index = (i * (n - 1)) / (m - 1); + const low = Math.floor(index); + const high = Math.ceil(index); + const t = index - low; + if (high >= n) { + result[i] = data[n - 1]; + } else { + result[i] = data[low] * (1 - t) + data[high] * t; + } + } + } + if (memoize) { + cache[mKey as string][dKey as string] = result; + } + return result; +}; + +export const WavRenderer = { + /** + * Renders a point-in-time snapshot of an audio sample, usually frequency values + * @param ctx + * @param data + * @param color + * @param cssWidth + * @param cssHeight + * @param pointCount number of bars to render + * @param barWidth width of bars in px + * @param barSpacing spacing between bars in px + * @param center vertically center the bars + */ + drawBars: ( + ctx: CanvasRenderingContext2D, + data: Float32Array, + cssWidth: number, + cssHeight: number, + color: string, + pointCount: number = 0, + barWidth: number = 0, + barSpacing: number = 0, + center: boolean = false + ) => { + pointCount = Math.floor( + Math.min( + pointCount, + (cssWidth - barSpacing) / (Math.max(barWidth, 1) + barSpacing) + ) + ); + if (!pointCount) { + pointCount = Math.floor( + (cssWidth - barSpacing) / (Math.max(barWidth, 1) + barSpacing) + ); + } + if (!barWidth) { + barWidth = (cssWidth - barSpacing) / pointCount - barSpacing; + } + const points = normalizeArray(data, pointCount, true); + for (let i = 0; i < pointCount; i++) { + const amplitude = Math.abs(points[i]); + const height = Math.max(1, amplitude * cssHeight); + const x = barSpacing + i * (barWidth + barSpacing); + const y = center ? 
(cssHeight - height) / 2 : cssHeight - height; + const radius = Math.min(barWidth / 2, height / 2); // Calculate the radius for rounded corners + + ctx.fillStyle = color; + ctx.beginPath(); + ctx.moveTo(x + radius, y); + ctx.lineTo(x + barWidth - radius, y); + ctx.arcTo(x + barWidth, y, x + barWidth, y + radius, radius); + ctx.lineTo(x + barWidth, y + height - radius); + ctx.arcTo( + x + barWidth, + y + height, + x + barWidth - radius, + y + height, + radius + ); + ctx.lineTo(x + radius, y + height); + ctx.arcTo(x, y + height, x, y + height - radius, radius); + ctx.lineTo(x, y + radius); + ctx.arcTo(x, y, x + radius, y, radius); + ctx.closePath(); + ctx.fill(); + } + } +}; diff --git a/libs/react-client/src/wavtools/wav_stream_player.js b/libs/react-client/src/wavtools/wav_stream_player.js new file mode 100644 index 0000000000..6cbd061539 --- /dev/null +++ b/libs/react-client/src/wavtools/wav_stream_player.js @@ -0,0 +1,162 @@ +import { AudioAnalysis } from './analysis/audio_analysis.js'; +import { StreamProcessorSrc } from './worklets/stream_processor.js'; + +/** + * Plays audio streams received in raw PCM16 chunks from the browser + * @class + */ +export class WavStreamPlayer { + /** + * Creates a new WavStreamPlayer instance + * @param {{sampleRate?: number}} options + * @returns {WavStreamPlayer} + */ + constructor({ sampleRate = 24000, onStop } = {}) { + this.scriptSrc = StreamProcessorSrc; + this.onStop = onStop; + this.sampleRate = sampleRate; + this.context = null; + this.stream = null; + this.analyser = null; + this.trackSampleOffsets = {}; + this.interruptedTrackIds = {}; + } + + /** + * Connects the audio context and enables output to speakers + * @returns {Promise} + */ + async connect() { + this.context = new AudioContext({ sampleRate: this.sampleRate }); + if (this.context.state === 'suspended') { + await this.context.resume(); + } + try { + await this.context.audioWorklet.addModule(this.scriptSrc); + } catch (e) { + console.error(e); + throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); + } + const analyser = this.context.createAnalyser(); + analyser.fftSize = 8192; + analyser.smoothingTimeConstant = 0.1; + this.analyser = analyser; + return true; + } + + /** + * Gets the current frequency domain data from the playing track + * @param {"frequency"|"music"|"voice"} [analysisType] + * @param {number} [minDecibels] default -100 + * @param {number} [maxDecibels] default -30 + * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} + */ + getFrequencies( + analysisType = 'frequency', + minDecibels = -100, + maxDecibels = -30 + ) { + if (!this.analyser) { + throw new Error('Not connected, please call .connect() first'); + } + return AudioAnalysis.getFrequencies( + this.analyser, + this.sampleRate, + null, + analysisType, + minDecibels, + maxDecibels + ); + } + + /** + * Starts audio streaming + * @private + * @returns {Promise} + */ + _start() { + const streamNode = new AudioWorkletNode(this.context, 'stream_processor'); + streamNode.connect(this.context.destination); + streamNode.port.onmessage = (e) => { + const { event } = e.data; + if (event === 'stop') { + this.onStop?.(); + streamNode.disconnect(); + this.stream = null; + } else if (event === 'offset') { + const { requestId, trackId, offset } = e.data; + const currentTime = offset / this.sampleRate; + this.trackSampleOffsets[requestId] = { trackId, offset, currentTime }; + } + }; + this.analyser.disconnect(); + streamNode.connect(this.analyser); + this.stream = streamNode; + return 
true; + } + + /** + * Adds 16BitPCM data to the currently playing audio stream + * You can add chunks beyond the current play point and they will be queued for play + * @param {ArrayBuffer|Int16Array} arrayBuffer + * @param {string} [trackId] + * @returns {Int16Array} + */ + add16BitPCM(arrayBuffer, trackId = 'default') { + if (typeof trackId !== 'string') { + throw new Error(`trackId must be a string`); + } else if (this.interruptedTrackIds[trackId]) { + return; + } + if (!this.stream) { + this._start(); + } + let buffer; + if (arrayBuffer instanceof Int16Array) { + buffer = arrayBuffer; + } else if (arrayBuffer instanceof ArrayBuffer) { + buffer = new Int16Array(arrayBuffer); + } else { + throw new Error(`argument must be Int16Array or ArrayBuffer`); + } + this.stream.port.postMessage({ event: 'write', buffer, trackId }); + return buffer; + } + + /** + * Gets the offset (sample count) of the currently playing stream + * @param {boolean} [interrupt] + * @returns {{trackId: string|null, offset: number, currentTime: number}} + */ + async getTrackSampleOffset(interrupt = false) { + if (!this.stream) { + return null; + } + const requestId = crypto.randomUUID(); + this.stream.port.postMessage({ + event: interrupt ? 'interrupt' : 'offset', + requestId + }); + let trackSampleOffset; + while (!trackSampleOffset) { + trackSampleOffset = this.trackSampleOffsets[requestId]; + await new Promise((r) => setTimeout(() => r(), 1)); + } + const { trackId } = trackSampleOffset; + if (interrupt && trackId) { + this.interruptedTrackIds[trackId] = true; + } + return trackSampleOffset; + } + + /** + * Strips the current stream and returns the sample offset of the audio + * @param {boolean} [interrupt] + * @returns {{trackId: string|null, offset: number, currentTime: number}} + */ + async interrupt() { + return this.getTrackSampleOffset(true); + } +} + +globalThis.WavStreamPlayer = WavStreamPlayer; diff --git a/libs/react-client/src/wavtools/worklets/audio_processor.js b/libs/react-client/src/wavtools/worklets/audio_processor.js new file mode 100644 index 0000000000..66ce005841 --- /dev/null +++ b/libs/react-client/src/wavtools/worklets/audio_processor.js @@ -0,0 +1,214 @@ +const AudioProcessorWorklet = ` +class AudioProcessor extends AudioWorkletProcessor { + + constructor() { + super(); + this.port.onmessage = this.receive.bind(this); + this.initialize(); + } + + initialize() { + this.foundAudio = false; + this.recording = false; + this.chunks = []; + } + + /** + * Concatenates sampled chunks into channels + * Format is chunk[Left[], Right[]] + */ + readChannelData(chunks, channel = -1, maxChannels = 9) { + let channelLimit; + if (channel !== -1) { + if (chunks[0] && chunks[0].length - 1 < channel) { + throw new Error( + \`Channel \${channel} out of range: max \${chunks[0].length}\` + ); + } + channelLimit = channel + 1; + } else { + channel = 0; + channelLimit = Math.min(chunks[0] ? chunks[0].length : 1, maxChannels); + } + const channels = []; + for (let n = channel; n < channelLimit; n++) { + const length = chunks.reduce((sum, chunk) => { + return sum + chunk[n].length; + }, 0); + const buffers = chunks.map((chunk) => chunk[n]); + const result = new Float32Array(length); + let offset = 0; + for (let i = 0; i < buffers.length; i++) { + result.set(buffers[i], offset); + offset += buffers[i].length; + } + channels[n] = result; + } + return channels; + } + + /** + * Combines parallel audio data into correct format, + * channels[Left[], Right[]] to float32Array[LRLRLRLR...] 
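+ * Also returns meanValues: the per-sample mean across channels (a mono mixdown)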
+ */ + formatAudioData(channels) { + if (channels.length === 1) { + // Simple case is only one channel + const float32Array = channels[0].slice(); + const meanValues = channels[0].slice(); + return { float32Array, meanValues }; + } else { + const float32Array = new Float32Array( + channels[0].length * channels.length + ); + const meanValues = new Float32Array(channels[0].length); + for (let i = 0; i < channels[0].length; i++) { + const offset = i * channels.length; + let meanValue = 0; + for (let n = 0; n < channels.length; n++) { + float32Array[offset + n] = channels[n][i]; + meanValue += channels[n][i]; + } + meanValues[i] = meanValue / channels.length; + } + return { float32Array, meanValues }; + } + } + + /** + * Converts 32-bit float data to 16-bit integers + */ + floatTo16BitPCM(float32Array) { + const buffer = new ArrayBuffer(float32Array.length * 2); + const view = new DataView(buffer); + let offset = 0; + for (let i = 0; i < float32Array.length; i++, offset += 2) { + let s = Math.max(-1, Math.min(1, float32Array[i])); + view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); + } + return buffer; + } + + /** + * Retrieves the most recent amplitude values from the audio stream + * @param {number} channel + */ + getValues(channel = -1) { + const channels = this.readChannelData(this.chunks, channel); + const { meanValues } = this.formatAudioData(channels); + return { meanValues, channels }; + } + + /** + * Exports chunks as an audio/wav file + */ + export() { + const channels = this.readChannelData(this.chunks); + const { float32Array, meanValues } = this.formatAudioData(channels); + const audioData = this.floatTo16BitPCM(float32Array); + return { + meanValues: meanValues, + audio: { + bitsPerSample: 16, + channels: channels, + data: audioData, + }, + }; + } + + receive(e) { + const { event, id } = e.data; + let receiptData = {}; + switch (event) { + case 'start': + this.recording = true; + break; + case 'stop': + this.recording = false; + break; + case 'clear': + this.initialize(); + break; + case 'export': + receiptData = this.export(); + break; + case 'read': + receiptData = this.getValues(); + break; + default: + break; + } + // Always send back receipt + this.port.postMessage({ event: 'receipt', id, data: receiptData }); + } + + sendChunk(chunk) { + const channels = this.readChannelData([chunk]); + const { float32Array, meanValues } = this.formatAudioData(channels); + const rawAudioData = this.floatTo16BitPCM(float32Array); + const monoAudioData = this.floatTo16BitPCM(meanValues); + this.port.postMessage({ + event: 'chunk', + data: { + mono: monoAudioData, + raw: rawAudioData, + }, + }); + } + + process(inputList, outputList, parameters) { + // Copy input to output (e.g. 
speakers) + // Note that this creates choppy sounds with Mac products + const sourceLimit = Math.min(inputList.length, outputList.length); + for (let inputNum = 0; inputNum < sourceLimit; inputNum++) { + const input = inputList[inputNum]; + const output = outputList[inputNum]; + const channelCount = Math.min(input.length, output.length); + for (let channelNum = 0; channelNum < channelCount; channelNum++) { + input[channelNum].forEach((sample, i) => { + output[channelNum][i] = sample; + }); + } + } + const inputs = inputList[0]; + // There's latency at the beginning of a stream before recording starts + // Make sure we actually receive audio data before we start storing chunks + let sliceIndex = 0; + if (!this.foundAudio) { + for (const channel of inputs) { + sliceIndex = 0; // reset for each channel + if (this.foundAudio) { + break; + } + if (channel) { + for (const value of channel) { + if (value !== 0) { + // find only one non-zero entry in any channel + this.foundAudio = true; + break; + } else { + sliceIndex++; + } + } + } + } + } + if (inputs && inputs[0] && this.foundAudio && this.recording) { + // We need to copy the TypedArray, because the \`process\` + // internals will reuse the same buffer to hold each input + const chunk = inputs.map((input) => input.slice(sliceIndex)); + this.chunks.push(chunk); + this.sendChunk(chunk); + } + return true; + } +} + +registerProcessor('audio_processor', AudioProcessor); +`; + +const script = new Blob([AudioProcessorWorklet], { + type: 'application/javascript' +}); +const src = URL.createObjectURL(script); +export const AudioProcessorSrc = src; diff --git a/libs/react-client/src/wavtools/worklets/stream_processor.js b/libs/react-client/src/wavtools/worklets/stream_processor.js new file mode 100644 index 0000000000..b594bc511c --- /dev/null +++ b/libs/react-client/src/wavtools/worklets/stream_processor.js @@ -0,0 +1,96 @@ +export const StreamProcessorWorklet = ` +class StreamProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.hasStarted = false; + this.hasInterrupted = false; + this.outputBuffers = []; + this.bufferLength = 128; + this.write = { buffer: new Float32Array(this.bufferLength), trackId: null }; + this.writeOffset = 0; + this.trackSampleOffsets = {}; + this.port.onmessage = (event) => { + if (event.data) { + const payload = event.data; + if (payload.event === 'write') { + const int16Array = payload.buffer; + const float32Array = new Float32Array(int16Array.length); + for (let i = 0; i < int16Array.length; i++) { + float32Array[i] = int16Array[i] / 0x8000; // Convert Int16 to Float32 + } + this.writeData(float32Array, payload.trackId); + } else if ( + payload.event === 'offset' || + payload.event === 'interrupt' + ) { + const requestId = payload.requestId; + const trackId = this.write.trackId; + const offset = this.trackSampleOffsets[trackId] || 0; + this.port.postMessage({ + event: 'offset', + requestId, + trackId, + offset, + }); + if (payload.event === 'interrupt') { + this.hasInterrupted = true; + } + } else { + throw new Error(\`Unhandled event "\${payload.event}"\`); + } + } + }; + } + + writeData(float32Array, trackId = null) { + let { buffer } = this.write; + let offset = this.writeOffset; + for (let i = 0; i < float32Array.length; i++) { + buffer[offset++] = float32Array[i]; + if (offset >= buffer.length) { + this.outputBuffers.push(this.write); + this.write = { buffer: new Float32Array(this.bufferLength), trackId }; + buffer = this.write.buffer; + offset = 0; + } + } + this.writeOffset = offset; + 
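+ // Keep the partially filled buffer; the next 'write' resumes at writeOffset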
return true; + } + + process(inputs, outputs, parameters) { + const output = outputs[0]; + const outputChannelData = output[0]; + const outputBuffers = this.outputBuffers; + if (this.hasInterrupted) { + this.port.postMessage({ event: 'stop' }); + return false; + } else if (outputBuffers.length) { + this.hasStarted = true; + const { buffer, trackId } = outputBuffers.shift(); + for (let i = 0; i < outputChannelData.length; i++) { + outputChannelData[i] = buffer[i] || 0; + } + if (trackId) { + this.trackSampleOffsets[trackId] = + this.trackSampleOffsets[trackId] || 0; + this.trackSampleOffsets[trackId] += buffer.length; + } + return true; + } else if (this.hasStarted) { + this.port.postMessage({ event: 'stop' }); + return false; + } else { + return true; + } + } +} + +registerProcessor('stream_processor', StreamProcessor); +`; + +const script = new Blob([StreamProcessorWorklet], { + type: 'application/javascript' +}); +const src = URL.createObjectURL(script); +export const StreamProcessorSrc = src;
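A minimal usage sketch of how the new WavRecorder and WavStreamPlayer classes fit together. This is illustrative only: it assumes the wavtools index re-exports both classes and that the WavRecorder constructor accepts a { sampleRate } option (as in the upstream wavtools library, defined in parts of this patch not shown here). The function names startRealtimeAudio, onServerAudio, stopRealtimeAudio and the sendAudioChunk transport hook are hypothetical and not part of this patch.

import { WavRecorder, WavStreamPlayer } from './wavtools';

// Capture and playback run at the same sample rate; 24 kHz matches the new config default.
const recorder = new WavRecorder({ sampleRate: 24000 });
const player = new WavStreamPlayer({ sampleRate: 24000 });

export async function startRealtimeAudio(sendAudioChunk: (pcm: ArrayBuffer) => void) {
  await player.connect();   // prepare the output worklet and analyser
  await recorder.begin();   // request the microphone (default device), graph starts paused
  // record() invokes the callback with mono/raw PCM16 buffers once the mono buffer reaches 8192 bytes
  await recorder.record(({ mono }) => sendAudioChunk(mono), 8192);
}

export function onServerAudio(chunk: ArrayBuffer, trackId = 'default') {
  player.add16BitPCM(chunk, trackId);   // queue PCM16 audio received from the server for playback
}

export async function stopRealtimeAudio() {
  const wav = await recorder.end();   // stop mic tracks and pack the captured session as WAV
  await player.interrupt();           // halt playback and mark the current track as interrupted
  return wav;
}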