stt models (#147)
freddyaboulton authored Mar 7, 2025
1 parent cbbfa17 commit 504eb45
Showing 6 changed files with 56 additions and 65 deletions.
backend/fastrtc/reply_on_stopwords.py (7 changes: 3 additions & 4 deletions)
@@ -13,7 +13,7 @@
     ReplyFnGenerator,
     ReplyOnPause,
 )
-from .speech_to_text import get_stt_model
+from .speech_to_text import get_stt_model, stt_for_chunks
 from .utils import audio_to_float32, create_message
 
 logger = logging.getLogger(__name__)
@@ -105,10 +105,9 @@ def determine_pause( # type: ignore
         dur_vad, chunks = self.model.vad(
             (16000, state.post_stop_word_buffer),
             self.model_options,
-            return_chunks=True,
         )
-        text = self.stt_model.stt_for_chunks(
-            (16000, state.post_stop_word_buffer), chunks
+        text = stt_for_chunks(
+            self.stt_model, (16000, state.post_stop_word_buffer), chunks
         )
         logger.debug(f"STT: {text}")
         state.stop_word_detected = self.stop_word_detected(text)
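With `stt_for_chunks` moved off the `STTModel` protocol (see `stt_.py` below), the protocol shrinks to a single `stt` method, so any object that provides one can be handed to the stop-word handler. A minimal sketch of what now satisfies the protocol; `DurationSTT` is a hypothetical stand-in, not part of fastrtc:

import numpy as np
from numpy.typing import NDArray


class DurationSTT:
    """Toy STT stub: implements only `stt`, which is all STTModel now requires."""

    def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str:
        sr, samples = audio
        # A real model would transcribe; this stub just reports clip length.
        return f"[{len(samples) / sr:.2f}s]"

Chunk-wise transcription then comes for free via the module-level `stt_for_chunks(model, audio, chunks)` introduced later in this commit.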
backend/fastrtc/speech_to_text/__init__.py (4 changes: 2 additions & 2 deletions)
@@ -1,3 +1,3 @@
-from .stt_ import MoonshineSTT, get_stt_model
+from .stt_ import MoonshineSTT, get_stt_model, stt_for_chunks
 
-__all__ = ["get_stt_model", "MoonshineSTT", "get_stt_model"]
+__all__ = ["get_stt_model", "MoonshineSTT", "get_stt_model", "stt_for_chunks"]
backend/fastrtc/speech_to_text/stt_.py (33 changes: 14 additions & 19 deletions)
@@ -15,12 +15,6 @@
 class STTModel(Protocol):
     def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str: ...
 
-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str: ...
-
 
 class MoonshineSTT(STTModel):
     def __init__(
@@ -49,19 +43,6 @@ def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str:
         tokens = self.model.generate(audio_np)
         return self.tokenizer.decode_batch(tokens)[0]
 
-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str:
-        sr, audio_np = audio
-        return " ".join(
-            [
-                self.stt((sr, audio_np[chunk["start"] : chunk["end"]]))
-                for chunk in chunks
-            ]
-        )
-
 
 @lru_cache
 def get_stt_model(
@@ -79,3 +60,17 @@ def get_stt_model(
         m.stt((16000, audio))
         print(click.style("INFO", fg="green") + ":\t STT model warmed up.")
     return m
+
+
+def stt_for_chunks(
+    stt_model: STTModel,
+    audio: tuple[int, NDArray[np.int16 | np.float32]],
+    chunks: list[AudioChunk],
+) -> str:
+    sr, audio_np = audio
+    return " ".join(
+        [
+            stt_model.stt((sr, audio_np[chunk["start"] : chunk["end"]]))
+            for chunk in chunks
+        ]
+    )
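For reference, a minimal usage sketch of the relocated helper. The chunk boundaries below are hypothetical (in practice they come from the VAD model, as in `reply_on_stopwords.py` above), and `get_stt_model` is called with its defaults:

import numpy as np
from fastrtc.speech_to_text import get_stt_model, stt_for_chunks

model = get_stt_model()  # cached via lru_cache; warm-up runs on first creation

sr = 16000
audio = np.zeros(sr * 2, dtype=np.float32)  # 2 s placeholder signal

# Chunk boundaries are sample offsets, matching audio_np[chunk["start"]:chunk["end"]].
chunks = [{"start": 0, "end": sr}, {"start": sr, "end": 2 * sr}]

text = stt_for_chunks(model, (sr, audio), chunks)  # per-chunk transcripts joined by " "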
demo/nextjs_voice_chat/backend/server.py (74 changes: 35 additions & 39 deletions)
@@ -1,5 +1,4 @@
 import fastapi
-from fastapi.responses import FileResponse
 from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
 from fastrtc.utils import audio_to_bytes
 from openai import OpenAI
@@ -9,7 +8,6 @@
 from elevenlabs import VoiceSettings, stream
 from elevenlabs.client import ElevenLabs
 import numpy as np
-import io
 
 from .env import LLM_API_KEY, ELEVENLABS_API_KEY
 
@@ -22,16 +20,14 @@
 
 messages = [{"role": "system", "content": sys_prompt}]
 
-openai_client = OpenAI(
-    api_key=LLM_API_KEY
-)
+openai_client = OpenAI(api_key=LLM_API_KEY)
 
 elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
 
 logging.basicConfig(level=logging.INFO)
 
-def echo(audio):
-
+
+def echo(audio):
     stt_time = time.time()
 
     logging.info("Performing STT")
@@ -54,18 +50,15 @@ def echo(audio):
     logging.info(f"STT took {time.time() - stt_time} seconds")
 
     llm_time = time.time()
 
     def text_stream():
         global full_response
         full_response = ""
 
         response = openai_client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=messages,
-            max_tokens=200,
-            stream=True
+            model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
         )
 
         for chunk in response:
             if chunk.choices[0].finish_reason == "stop":
                 break
@@ -77,41 +70,43 @@ def text_stream():
         text=text_stream(),
         voice="Rachel",  # Cassidy is also really good
         voice_settings=VoiceSettings(
-            similarity_boost=0.9,
-            stability=0.6,
-            style=0.4,
-            speed=1
+            similarity_boost=0.9, stability=0.6, style=0.4, speed=1
         ),
         model="eleven_multilingual_v2",
         output_format="pcm_24000",
-        stream=True
+        stream=True,
     )
 
     for audio_chunk in audio_stream:
-        audio_array = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        audio_array = (
+            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        )
         yield (24000, audio_array)
 
     messages.append({"role": "assistant", "content": full_response + " "})
     logging.info(f"LLM response: {full_response}")
     logging.info(f"LLM took {time.time() - llm_time} seconds")
 
 
-stream = Stream(ReplyOnPause(echo,
-    algo_options=AlgoOptions(
-        audio_chunk_duration=0.5,
-        started_talking_threshold=0.1,
-        speech_threshold=0.03
-    ),
-    model_options=SileroVadOptions(
-        threshold=0.75,
-        min_speech_duration_ms=250,
-        min_silence_duration_ms=1500,
-        speech_pad_ms=400,
-        max_speech_duration_s=15
-    )),
-    modality="audio",
-    mode="send-receive"
-)
+stream = Stream(
+    ReplyOnPause(
+        echo,
+        algo_options=AlgoOptions(
+            audio_chunk_duration=0.5,
+            started_talking_threshold=0.1,
+            speech_threshold=0.03,
+        ),
+        model_options=SileroVadOptions(
+            threshold=0.75,
+            min_speech_duration_ms=250,
+            min_silence_duration_ms=1500,
+            speech_pad_ms=400,
+            max_speech_duration_s=15,
+        ),
+    ),
+    modality="audio",
+    mode="send-receive",
+)
 
 app = fastapi.FastAPI()
 
@@ -125,9 +120,10 @@ def text_stream():
 
 stream.mount(app)
 
+
 @app.get("/reset")
 async def reset():
     global messages
     logging.info("Resetting chat")
     messages = [{"role": "system", "content": sys_prompt}]
-    return {"status": "success"}
\ No newline at end of file
+    return {"status": "success"}
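Since the demo now exposes a plain FastAPI app with the stream mounted on it, any ASGI runner can serve it. A minimal sketch, assuming the `backend.server` module path implied by the demo layout and a free port 8000:

import uvicorn

# Run from demo/nextjs_voice_chat so the relative "backend.server:app" path resolves.
uvicorn.run("backend.server:app", host="127.0.0.1", port=8000)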
mkdocs.yml (1 change: 1 addition & 0 deletions)
@@ -28,6 +28,7 @@ nav:
   - Cookbook: cookbook.md
   - Deployment: deployment.md
   - Advanced Configuration: advanced-configuration.md
+  - Speech-to-Text Gallery: speech_to_text_gallery.md
   - VAD Gallery: vad_gallery.md
   - Utils: utils.md
   - Frequently Asked Questions: faq.md
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -83,4 +83,4 @@ packages = ["/backend/fastrtc"]
 
 [tool.ruff]
 target-version = "py310"
-extend-exclude = ["demo/phonic_chat"]
+extend-exclude = ["demo/phonic_chat", "demo/nextjs_voice_chat"]
