stt models (#147)
freddyaboulton authored Mar 7, 2025
1 parent cbbfa17 commit 504eb45
Showing 6 changed files with 56 additions and 65 deletions.
backend/fastrtc/reply_on_stopwords.py (7 changes: 3 additions & 4 deletions)
@@ -13,7 +13,7 @@
     ReplyFnGenerator,
     ReplyOnPause,
 )
-from .speech_to_text import get_stt_model
+from .speech_to_text import get_stt_model, stt_for_chunks
 from .utils import audio_to_float32, create_message
 
 logger = logging.getLogger(__name__)
@@ -105,10 +105,9 @@ def determine_pause( # type: ignore
         dur_vad, chunks = self.model.vad(
             (16000, state.post_stop_word_buffer),
             self.model_options,
-            return_chunks=True,
         )
-        text = self.stt_model.stt_for_chunks(
-            (16000, state.post_stop_word_buffer), chunks
+        text = stt_for_chunks(
+            self.stt_model, (16000, state.post_stop_word_buffer), chunks
         )
         logger.debug(f"STT: {text}")
         state.stop_word_detected = self.stop_word_detected(text)
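With `stt_for_chunks` moved off the `STTModel` protocol (see `stt_.py` below), the protocol shrinks to a single `stt` method, so any object that provides one can be handed to the stop-word handler. A minimal sketch of what now satisfies the protocol; `DurationSTT` is a hypothetical stand-in, not part of fastrtc:

import numpy as np
from numpy.typing import NDArray


class DurationSTT:
    """Toy STT stub: implements only `stt`, which is all STTModel now requires."""

    def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str:
        sr, samples = audio
        # A real model would transcribe; this stub just reports clip length.
        return f"[{len(samples) / sr:.2f}s]"

Chunk-wise transcription then comes for free via the module-level `stt_for_chunks(model, audio, chunks)` introduced later in this commit.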
backend/fastrtc/speech_to_text/__init__.py (4 changes: 2 additions & 2 deletions)
@@ -1,3 +1,3 @@
-from .stt_ import MoonshineSTT, get_stt_model
+from .stt_ import MoonshineSTT, get_stt_model, stt_for_chunks
 
-__all__ = ["get_stt_model", "MoonshineSTT", "get_stt_model"]
+__all__ = ["get_stt_model", "MoonshineSTT", "get_stt_model", "stt_for_chunks"]
backend/fastrtc/speech_to_text/stt_.py (33 changes: 14 additions & 19 deletions)
@@ -15,12 +15,6 @@
 class STTModel(Protocol):
     def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str: ...
 
-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str: ...
-
 
 class MoonshineSTT(STTModel):
     def __init__(
@@ -49,19 +43,6 @@ def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str:
         tokens = self.model.generate(audio_np)
         return self.tokenizer.decode_batch(tokens)[0]
 
-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str:
-        sr, audio_np = audio
-        return " ".join(
-            [
-                self.stt((sr, audio_np[chunk["start"] : chunk["end"]]))
-                for chunk in chunks
-            ]
-        )
-
 
 @lru_cache
 def get_stt_model(
@@ -79,3 +60,17 @@ def get_stt_model(
         m.stt((16000, audio))
         print(click.style("INFO", fg="green") + ":\t STT model warmed up.")
     return m
+
+
+def stt_for_chunks(
+    stt_model: STTModel,
+    audio: tuple[int, NDArray[np.int16 | np.float32]],
+    chunks: list[AudioChunk],
+) -> str:
+    sr, audio_np = audio
+    return " ".join(
+        [
+            stt_model.stt((sr, audio_np[chunk["start"] : chunk["end"]]))
+            for chunk in chunks
+        ]
+    )
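For reference, a minimal usage sketch of the relocated helper. The chunk boundaries below are hypothetical (in practice they come from the VAD model, as in `reply_on_stopwords.py` above), and `get_stt_model` is called with its defaults:

import numpy as np
from fastrtc.speech_to_text import get_stt_model, stt_for_chunks

model = get_stt_model()  # cached via lru_cache; warm-up runs on first creation

sr = 16000
audio = np.zeros(sr * 2, dtype=np.float32)  # 2 s placeholder signal

# Chunk boundaries are sample offsets, matching audio_np[chunk["start"]:chunk["end"]].
chunks = [{"start": 0, "end": sr}, {"start": sr, "end": 2 * sr}]

text = stt_for_chunks(model, (sr, audio), chunks)  # per-chunk transcripts joined by " "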
demo/nextjs_voice_chat/backend/server.py (74 changes: 35 additions & 39 deletions)
@@ -1,5 +1,4 @@
 import fastapi
-from fastapi.responses import FileResponse
 from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
 from fastrtc.utils import audio_to_bytes
 from openai import OpenAI
@@ -9,7 +8,6 @@
 from elevenlabs import VoiceSettings, stream
 from elevenlabs.client import ElevenLabs
 import numpy as np
-import io
 
 from .env import LLM_API_KEY, ELEVENLABS_API_KEY
 
@@ -22,16 +20,14 @@
 
 messages = [{"role": "system", "content": sys_prompt}]
 
-openai_client = OpenAI(
-    api_key=LLM_API_KEY
-)
+openai_client = OpenAI(api_key=LLM_API_KEY)
 
 elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
 
 logging.basicConfig(level=logging.INFO)
 
-def echo(audio):
-
+
+def echo(audio):
     stt_time = time.time()
 
     logging.info("Performing STT")
@@ -54,18 +50,15 @@ def echo(audio):
     logging.info(f"STT took {time.time() - stt_time} seconds")
 
     llm_time = time.time()
 
     def text_stream():
         global full_response
         full_response = ""
 
         response = openai_client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=messages,
-            max_tokens=200,
-            stream=True
+            model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
         )
 
         for chunk in response:
             if chunk.choices[0].finish_reason == "stop":
                 break
@@ -77,41 +70,43 @@ def text_stream():
         text=text_stream(),
         voice="Rachel",  # Cassidy is also really good
         voice_settings=VoiceSettings(
-            similarity_boost=0.9,
-            stability=0.6,
-            style=0.4,
-            speed=1
+            similarity_boost=0.9, stability=0.6, style=0.4, speed=1
         ),
         model="eleven_multilingual_v2",
         output_format="pcm_24000",
-        stream=True
+        stream=True,
     )
 
     for audio_chunk in audio_stream:
-        audio_array = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        audio_array = (
+            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        )
         yield (24000, audio_array)
 
     messages.append({"role": "assistant", "content": full_response + " "})
     logging.info(f"LLM response: {full_response}")
     logging.info(f"LLM took {time.time() - llm_time} seconds")
 
 
-stream = Stream(ReplyOnPause(echo,
-    algo_options=AlgoOptions(
-        audio_chunk_duration=0.5,
-        started_talking_threshold=0.1,
-        speech_threshold=0.03
-    ),
-    model_options=SileroVadOptions(
-        threshold=0.75,
-        min_speech_duration_ms=250,
-        min_silence_duration_ms=1500,
-        speech_pad_ms=400,
-        max_speech_duration_s=15
-    )),
-    modality="audio",
-    mode="send-receive"
-)
+stream = Stream(
+    ReplyOnPause(
+        echo,
+        algo_options=AlgoOptions(
+            audio_chunk_duration=0.5,
+            started_talking_threshold=0.1,
+            speech_threshold=0.03,
+        ),
+        model_options=SileroVadOptions(
+            threshold=0.75,
+            min_speech_duration_ms=250,
+            min_silence_duration_ms=1500,
+            speech_pad_ms=400,
+            max_speech_duration_s=15,
+        ),
+    ),
+    modality="audio",
+    mode="send-receive",
+)
 
 app = fastapi.FastAPI()
 
@@ -125,9 +120,10 @@ def text_stream():
 
 stream.mount(app)
 
+
 @app.get("/reset")
 async def reset():
     global messages
     logging.info("Resetting chat")
     messages = [{"role": "system", "content": sys_prompt}]
-    return {"status": "success"}
\ No newline at end of file
+    return {"status": "success"}
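Since the demo now exposes a plain FastAPI app with the stream mounted on it, any ASGI runner can serve it. A minimal sketch, assuming the `backend.server` module path implied by the demo layout and a free port 8000:

import uvicorn

# Run from demo/nextjs_voice_chat so the relative "backend.server:app" path resolves.
uvicorn.run("backend.server:app", host="127.0.0.1", port=8000)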
mkdocs.yml (1 change: 1 addition & 0 deletions)
@@ -28,6 +28,7 @@ nav:
   - Cookbook: cookbook.md
   - Deployment: deployment.md
   - Advanced Configuration: advanced-configuration.md
+  - Speech-to-Text Gallery: speech_to_text_gallery.md
   - VAD Gallery: vad_gallery.md
   - Utils: utils.md
   - Frequently Asked Questions: faq.md
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -83,4 +83,4 @@ packages = ["/backend/fastrtc"]
 
 [tool.ruff]
 target-version = "py310"
-extend-exclude = ["demo/phonic_chat"]
+extend-exclude = ["demo/phonic_chat", "demo/nextjs_voice_chat"]
