diff --git a/backend/fastrtc/reply_on_stopwords.py b/backend/fastrtc/reply_on_stopwords.py
index 6a05e76..b871c1a 100644
--- a/backend/fastrtc/reply_on_stopwords.py
+++ b/backend/fastrtc/reply_on_stopwords.py
@@ -13,7 +13,7 @@
     ReplyFnGenerator,
     ReplyOnPause,
 )
-from .speech_to_text import get_stt_model
+from .speech_to_text import get_stt_model, stt_for_chunks
 from .utils import audio_to_float32, create_message
 
 logger = logging.getLogger(__name__)
@@ -105,10 +105,9 @@ def determine_pause(  # type: ignore
             dur_vad, chunks = self.model.vad(
                 (16000, state.post_stop_word_buffer),
                 self.model_options,
-                return_chunks=True,
             )
-            text = self.stt_model.stt_for_chunks(
-                (16000, state.post_stop_word_buffer), chunks
+            text = stt_for_chunks(
+                self.stt_model, (16000, state.post_stop_word_buffer), chunks
             )
             logger.debug(f"STT: {text}")
             state.stop_word_detected = self.stop_word_detected(text)
diff --git a/backend/fastrtc/speech_to_text/__init__.py b/backend/fastrtc/speech_to_text/__init__.py
index 9dec76b..92fc2d8 100644
--- a/backend/fastrtc/speech_to_text/__init__.py
+++ b/backend/fastrtc/speech_to_text/__init__.py
@@ -1,3 +1,3 @@
-from .stt_ import MoonshineSTT, get_stt_model
+from .stt_ import MoonshineSTT, get_stt_model, stt_for_chunks
 
-__all__ = ["get_stt_model", "MoonshineSTT", "get_stt_model"]
+__all__ = ["get_stt_model", "MoonshineSTT", "stt_for_chunks"]
diff --git a/backend/fastrtc/speech_to_text/stt_.py b/backend/fastrtc/speech_to_text/stt_.py
index d528d0e..f8d7a6c 100644
--- a/backend/fastrtc/speech_to_text/stt_.py
+++ b/backend/fastrtc/speech_to_text/stt_.py
@@ -15,12 +15,6 @@ class STTModel(Protocol):
     def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str: ...
 
-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str: ...
-
 
 class MoonshineSTT(STTModel):
     def __init__(
         self,
@@ -49,19 +43,6 @@ def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str:
         tokens = self.model.generate(audio_np)
         return self.tokenizer.decode_batch(tokens)[0]
 
-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str:
-        sr, audio_np = audio
-        return " ".join(
-            [
-                self.stt((sr, audio_np[chunk["start"] : chunk["end"]]))
-                for chunk in chunks
-            ]
-        )
-
 
 @lru_cache
 def get_stt_model(
@@ -79,3 +60,18 @@ def get_stt_model(
     m.stt((16000, audio))
     print(click.style("INFO", fg="green") + ":\t STT model warmed up.")
     return m
+
+
+def stt_for_chunks(
+    stt_model: STTModel,
+    audio: tuple[int, NDArray[np.int16 | np.float32]],
+    chunks: list[AudioChunk],
+) -> str:
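+    """Transcribe each VAD chunk separately and join the partial texts."""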
+    sr, audio_np = audio
+    return " ".join(
+        [
+            stt_model.stt((sr, audio_np[chunk["start"] : chunk["end"]]))
+            for chunk in chunks
+        ]
+    )
diff --git a/demo/nextjs_voice_chat/backend/server.py b/demo/nextjs_voice_chat/backend/server.py
index ef3b687..f0a1eb1 100644
--- a/demo/nextjs_voice_chat/backend/server.py
+++ b/demo/nextjs_voice_chat/backend/server.py
@@ -1,5 +1,4 @@
 import fastapi
-from fastapi.responses import FileResponse
 from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
 from fastrtc.utils import audio_to_bytes
 from openai import OpenAI
@@ -9,7 +8,6 @@
 from elevenlabs import VoiceSettings, stream
 from elevenlabs.client import ElevenLabs
 import numpy as np
-import io
 
 from .env import LLM_API_KEY, ELEVENLABS_API_KEY
 
@@ -22,16 +20,14 @@
 
 messages = [{"role": "system", "content": sys_prompt}]
 
-openai_client = OpenAI(
-    api_key=LLM_API_KEY
-)
+openai_client = OpenAI(api_key=LLM_API_KEY)
 
 elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
 
 logging.basicConfig(level=logging.INFO)
 
 
-def echo(audio): 
+def echo(audio):
     stt_time = time.time()
 
     logging.info("Performing STT")
@@ -54,18 +50,15 @@ def echo(audio):
     logging.info(f"STT took {time.time() - stt_time} seconds")
 
     llm_time = time.time()
-    
+
     def text_stream():
-        global full_response 
+        global full_response
         full_response = ""
-        
+
         response = openai_client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=messages,
-            max_tokens=200,
-            stream=True
+            model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
         )
-        
+
         for chunk in response:
             if chunk.choices[0].finish_reason == "stop":
                 break
@@ -77,41 +70,44 @@ def text_stream():
         text=text_stream(),
         voice="Rachel",  # Cassidy is also really good
         voice_settings=VoiceSettings(
-            similarity_boost=0.9,
-            stability=0.6,
-            style=0.4,
-            speed=1
+            similarity_boost=0.9, stability=0.6, style=0.4, speed=1
         ),
         model="eleven_multilingual_v2",
         output_format="pcm_24000",
-        stream=True
+        stream=True,
     )
 
     for audio_chunk in audio_stream:
-        audio_array = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        audio_array = (
+            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        )
         yield (24000, audio_array)
 
     messages.append({"role": "assistant", "content": full_response + " "})
 
     logging.info(f"LLM response: {full_response}")
     logging.info(f"LLM took {time.time() - llm_time} seconds")
-    
-
-stream = Stream(ReplyOnPause(echo,
-                algo_options=AlgoOptions(
-                    audio_chunk_duration=0.5,
-                    started_talking_threshold=0.1,
-                    speech_threshold=0.03
-                ),
-                model_options=SileroVadOptions(
-                    threshold=0.75,
-                    min_speech_duration_ms=250,
-                    min_silence_duration_ms=1500,
-                    speech_pad_ms=400,
-                    max_speech_duration_s=15
-                )),
-                modality="audio",
-                mode="send-receive"
-                )
+
+
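+# VAD tuning: a reply starts after ~1.5 s of silence; a single turn caps at 15 s.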
+stream = Stream(
+    ReplyOnPause(
+        echo,
+        algo_options=AlgoOptions(
+            audio_chunk_duration=0.5,
+            started_talking_threshold=0.1,
+            speech_threshold=0.03,
+        ),
+        model_options=SileroVadOptions(
+            threshold=0.75,
+            min_speech_duration_ms=250,
+            min_silence_duration_ms=1500,
+            speech_pad_ms=400,
+            max_speech_duration_s=15,
+        ),
+    ),
+    modality="audio",
+    mode="send-receive",
+)
 
 app = fastapi.FastAPI()
@@ -125,9 +120,10 @@ def text_stream():
 
 stream.mount(app)
 
+
 @app.get("/reset")
 async def reset():
     global messages
     logging.info("Resetting chat")
     messages = [{"role": "system", "content": sys_prompt}]
-    return {"status": "success"}
\ No newline at end of file
+    return {"status": "success"}
diff --git a/mkdocs.yml b/mkdocs.yml
index 0c3d93a..91bf335 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -28,6 +28,7 @@ nav:
     - Cookbook: cookbook.md
     - Deployment: deployment.md
     - Advanced Configuration: advanced-configuration.md
+    - Speech-to-Text Gallery: speech_to_text_gallery.md
     - VAD Gallery: vad_gallery.md
     - Utils: utils.md
     - Frequently Asked Questions: faq.md
diff --git a/pyproject.toml b/pyproject.toml
index 070322e..79684b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -83,4 +83,4 @@ packages = ["/backend/fastrtc"]
 
 [tool.ruff]
 target-version = "py310"
-extend-exclude = ["demo/phonic_chat"]
+extend-exclude = ["demo/phonic_chat", "demo/nextjs_voice_chat"]