Skip to content

Commit

Permalink
feat: streaming whisper
Browse files Browse the repository at this point in the history
  • Loading branch information
varshith15 committed Dec 29, 2024
1 parent 29f6bb7 commit 21e4310
Show file tree
Hide file tree
Showing 10 changed files with 1,075 additions and 82 deletions.
32 changes: 24 additions & 8 deletions audio_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,36 @@
from comfystream.client import ComfyStreamClient

async def main():
cwd = "/home/user/ComfyUI"
client = ComfyStreamClient(cwd=cwd)

cwd = "/home/user/ComfyUI"
client = ComfyStreamClient(cwd=cwd, type="audio")
with open("./workflows/audio-whsiper-example-workflow.json", "r") as f:
prompt = json.load(f)

client.set_prompt(prompt)

waveform, _ = torchaudio.load("/home/user/harvard.wav")
waveform, sr = torchaudio.load("/home/user/harvard.wav")
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
waveform = resampler(waveform)
sr = 16000
if waveform.ndim > 1:
audio_tensor = waveform.mean(dim=0)
waveform = waveform.mean(dim=0, keepdim=True)

chunk_ms = 20
chunk_size = int(sr * (chunk_ms / 1000.0))

total_samples = waveform.shape[1]
offset = 0

results = []
while offset < total_samples:
end = min(offset + chunk_size, total_samples)
chunk = waveform[:, offset:end]
offset = end
results.append(await client.queue_prompt(chunk.numpy().squeeze()))

output = await client.queue_prompt(audio_tensor)
print(output)
print("Result:")
for result in results:
if result[0] is not None:
print(result[-1])

if __name__ == "__main__":
asyncio.run(main())
5 changes: 2 additions & 3 deletions nodes/audio_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from .apply_whisper import ApplyWhisper
from .load_audio_tensor import LoadAudioTensor
from .save_asr_response import SaveASRResponse
from .save_result import SaveResult
from .save_audio_tensor import SaveAudioTensor

NODE_CLASS_MAPPINGS = {"LoadAudioTensor": LoadAudioTensor, "SaveASRResponse": SaveASRResponse, "ApplyWhisper": ApplyWhisper, "SaveAudioTensor": SaveAudioTensor}
NODE_CLASS_MAPPINGS = {"LoadAudioTensor": LoadAudioTensor, "SaveResult": SaveResult, "SaveAudioTensor": SaveAudioTensor}

__all__ = ["NODE_CLASS_MAPPINGS"]
62 changes: 0 additions & 62 deletions nodes/audio_utils/apply_whisper.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from comfystream import tensor_cache

class SaveASRResponse:
class SaveResult:
CATEGORY = "audio_utils"
RETURN_TYPES = ()
FUNCTION = "execute"
Expand All @@ -10,15 +10,15 @@ class SaveASRResponse:
def INPUT_TYPES(s):
return {
"required": {
"data": ("DICT",),
"result": ("RESULT",),
}
}

@classmethod
def IS_CHANGED(s):
return float("nan")

def execute(self, data: dict):
def execute(self, result):
fut = tensor_cache.audio_outputs.pop()
fut.set_result(data)
return data
fut.set_result(result)
return result
5 changes: 5 additions & 0 deletions nodes/whisper_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .apply_whisper import ApplyWhisper

# Registry mapping node names to their implementing classes for this package
# (presumably the table the ComfyUI node loader reads -- confirm).
NODE_CLASS_MAPPINGS = {
    "ApplyWhisper": ApplyWhisper,
}

# Only the registry is exported from the package.
__all__ = ["NODE_CLASS_MAPPINGS"]
37 changes: 37 additions & 0 deletions nodes/whisper_utils/apply_whisper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from .whisper_online import FasterWhisperASR, VACOnlineASRProcessor

class ApplyWhisper:
    """Node that feeds incoming audio chunks to a streaming Whisper processor.

    One ``FasterWhisperASR`` backend and one ``VACOnlineASRProcessor`` are
    created per node instance, so processor state persists across successive
    ``apply_whisper`` calls on the same instance.
    """

    CATEGORY = "whisper_utils"
    RETURN_TYPES = ("RESULT",)
    FUNCTION = "apply_whisper"

    @classmethod
    def INPUT_TYPES(s):
        """Declare the single required ``audio`` input of type ``AUDIO``."""
        required_inputs = {"audio": ("AUDIO",)}
        return {"required": required_inputs}

    def __init__(self):
        """Build the ASR backend (English, ``large-v3``) and the online processor."""
        asr_options = dict(
            lan="en",
            modelsize="large-v3",
            cache_dir=None,
            model_dir=None,
            logfile=None,
        )
        self.asr = FasterWhisperASR(**asr_options)
        self.asr.use_vad()

        processor_options = dict(
            online_chunk_size=0.5,
            asr=self.asr,
            tokenizer=None,
            logfile=None,
            buffer_trimming=("segment", 15),
        )
        self.online = VACOnlineASRProcessor(**processor_options)

    def apply_whisper(self, audio):
        """Push one audio chunk into the processor and run one processing step.

        Returns a 1-tuple holding whatever ``process_iter()`` yields for this
        step (its exact structure is defined by ``whisper_online`` -- not
        visible here).
        """
        processor = self.online
        processor.insert_audio_chunk(audio)
        return (processor.process_iter(),)
146 changes: 146 additions & 0 deletions nodes/whisper_utils/silero_vad_iterator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import torch

# This is copied from silero-vad's utils_vad.py:
# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
# (except for changed defaults)

# Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE

class VADIterator:
    """Stateful voice-activity detector that consumes audio chunk by chunk.

    Each call feeds one chunk to the wrapped model and reports a speech
    ``start`` or ``end`` boundary when one is detected, tracking the running
    sample position across calls.
    """

    def __init__(self,
                 model,
                 threshold: float = 0.5,
                 sampling_rate: int = 16000,
                 min_silence_duration_ms: int = 500,  # makes sense on one recording that I checked
                 speech_pad_ms: int = 100  # same
                 ):
        """
        Class for stream imitation

        Parameters
        ----------
        model: preloaded .jit silero VAD model
        threshold: float (default - 0.5)
            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
        sampling_rate: int (default - 16000)
            Currently silero VAD models support 8000 and 16000 sample rates
        min_silence_duration_ms: int (default - 500 milliseconds)
            In the end of each speech chunk wait for min_silence_duration_ms before separating it
        speech_pad_ms: int (default - 100 milliseconds)
            Final speech chunks are padded by speech_pad_ms each side

        Raises
        ------
        ValueError
            If sampling_rate is neither 8000 nor 16000.
        """

        self.model = model
        self.threshold = threshold
        self.sampling_rate = sampling_rate

        if sampling_rate not in [8000, 16000]:
            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')

        # Durations converted from milliseconds to (possibly fractional) sample counts.
        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        self.reset_states()

    def reset_states(self):
        """Reset the model state and all per-stream bookkeeping."""
        self.model.reset_states()
        self.triggered = False   # True while inside a detected speech segment
        self.temp_end = 0        # sample position of a tentative segment end; 0 means none pending
        self.current_sample = 0  # total number of samples consumed so far

    def __call__(self, x, return_seconds=False):
        """
        x: torch.Tensor
            audio chunk (see examples in repo)
        return_seconds: bool (default - False)
            whether return timestamps in seconds (default - samples)

        Returns ``{'start': t}`` when speech begins, ``{'end': t}`` when a
        speech segment is closed, and ``None`` otherwise.

        Raises TypeError if x is not a tensor and cannot be converted to one.
        """

        if not torch.is_tensor(x):
            try:
                x = torch.Tensor(x)
            except Exception as err:
                # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
                # propagate, and chained so the underlying cause stays visible.
                raise TypeError("Audio cannot be casted to tensor. Cast it manually") from err

        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
        self.current_sample += window_size_samples

        speech_prob = self.model(x, self.sampling_rate).item()

        # Speech resumed before the silence lasted long enough: drop the tentative end.
        if (speech_prob >= self.threshold) and self.temp_end:
            self.temp_end = 0

        if (speech_prob >= self.threshold) and not self.triggered:
            self.triggered = True
            # Clamp at 0: padding must not push the start before the beginning
            # of the stream (it would otherwise be negative for early speech).
            speech_start = max(0, self.current_sample - self.speech_pad_samples)
            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}

        # The lower exit threshold (threshold - 0.15) adds hysteresis.
        if (speech_prob < self.threshold - 0.15) and self.triggered:
            if not self.temp_end:
                self.temp_end = self.current_sample
            if self.current_sample - self.temp_end < self.min_silence_samples:
                # Silence has not yet lasted min_silence_samples; keep waiting.
                return None
            else:
                speech_end = self.temp_end + self.speech_pad_samples
                self.temp_end = 0
                self.triggered = False
                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}

        return None

#######################
# because Silero now requires exactly 512-sized audio chunks

import numpy as np
class FixedVADIterator(VADIterator):
    """VADIterator variant that accepts audio of any length per call.

    Incoming audio is accumulated in an internal buffer and forwarded to the
    parent iterator in windows of exactly 512 samples; leftover samples wait
    for the next call. When one call spans several voiced segments, the result
    carries the start of the first segment and the end of the last one (or no
    end at all if the last segment is still open).
    """

    def reset_states(self):
        """Reset the parent state and empty the pending-sample buffer."""
        super().reset_states()
        self.buffer = np.array([], dtype=np.float32)

    def __call__(self, x, return_seconds=False):
        """Buffer ``x``, process every complete 512-sample window, merge the events.

        Returns a dict with ``'start'`` and/or ``'end'`` keys, or ``None`` when
        no boundary was detected (or the merged result ended up empty).
        """
        window = 512
        self.buffer = np.append(self.buffer, x)
        merged = None
        while len(self.buffer) >= window:
            event = super().__call__(self.buffer[:window], return_seconds=return_seconds)
            # Advance only after the parent call has returned.
            self.buffer = self.buffer[window:]
            if merged is None:
                merged = event
                continue
            if event is None:
                continue
            if 'end' in event:
                # Keep the most recent end.
                merged['end'] = event['end']
            if 'start' in event and 'end' in merged:
                # A new segment began after an earlier end: drop that end so
                # the segments are merged into one.
                del merged['end']
        return None if merged == {} else merged

if __name__ == "__main__":
    # Small demonstration of why FixedVADIterator is needed.

    import torch

    model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
    vac = FixedVADIterator(model)
    # vac = VADIterator(model)  # with this one, the second call below crashes

    # A chunk of exactly 512 samples works with both iterator classes.
    exact_chunk = np.zeros(512, dtype=np.float32)
    vac(exact_chunk)

    # A chunk of 511 samples crashes the plain VADIterator with
    # ops.prim.RaiseException("Input audio chunk is too short", "builtins.ValueError")
    # whereas FixedVADIterator just buffers it.
    short_chunk = np.zeros(512 - 1, dtype=np.float32)
    vac(short_chunk)
Loading

0 comments on commit 21e4310

Please sign in to comment.