feat: streaming whisper

yondonfu · Dec 29, 2024 · 21e4310 · 21e4310
1 parent 29f6bb7
commit 21e4310
Show file tree

Hide file tree

Showing 10 changed files with 1,075 additions and 82 deletions.
diff --git a/audio_example.py b/audio_example.py
@@ -5,20 +5,36 @@
 from comfystream.client import ComfyStreamClient
 
 async def main():
-    cwd = "/home/user/ComfyUI"
-    client = ComfyStreamClient(cwd=cwd)
-
+    cwd = "/home/user/ComfyUI"        
+    client = ComfyStreamClient(cwd=cwd, type="audio")
     with open("./workflows/audio-whsiper-example-workflow.json", "r") as f:
         prompt = json.load(f)
 
     client.set_prompt(prompt)
-
-    waveform, _ = torchaudio.load("/home/user/harvard.wav")
+    waveform, sr = torchaudio.load("/home/user/harvard.wav")
+    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+    waveform = resampler(waveform)
+    sr = 16000
     if waveform.ndim > 1:
-        audio_tensor = waveform.mean(dim=0)
+        waveform = waveform.mean(dim=0, keepdim=True)
+
+    chunk_ms = 20
+    chunk_size = int(sr * (chunk_ms / 1000.0))
+
+    total_samples = waveform.shape[1]
+    offset = 0
+
+    results = []
+    while offset < total_samples:
+        end = min(offset + chunk_size, total_samples)
+        chunk = waveform[:, offset:end]
+        offset = end
+        results.append(await client.queue_prompt(chunk.numpy().squeeze()))
 
-    output = await client.queue_prompt(audio_tensor)
-    print(output)
+    print("Result:")
+    for result in results:
+        if result[0] is not None:
+            print(result[-1])
 
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/nodes/audio_utils/__init__.py b/nodes/audio_utils/__init__.py
@@ -1,8 +1,7 @@
-from .apply_whisper import ApplyWhisper
 from .load_audio_tensor import LoadAudioTensor
-from .save_asr_response import SaveASRResponse
+from .save_result import SaveResult
 from .save_audio_tensor import SaveAudioTensor
 
-NODE_CLASS_MAPPINGS = {"LoadAudioTensor": LoadAudioTensor, "SaveASRResponse": SaveASRResponse, "ApplyWhisper": ApplyWhisper, "SaveAudioTensor": SaveAudioTensor}
+NODE_CLASS_MAPPINGS = {"LoadAudioTensor": LoadAudioTensor, "SaveResult": SaveResult, "SaveAudioTensor": SaveAudioTensor}
 
 __all__ = ["NODE_CLASS_MAPPINGS"]
diff --git a/nodes/audio_utils/apply_whisper.py b/nodes/audio_utils/apply_whisper.py
diff --git a/nodes/audio_utils/save_asr_response.py → nodes/audio_utils/save_result.py b/nodes/audio_utils/save_asr_response.py → nodes/audio_utils/save_result.py
@@ -1,6 +1,6 @@
 from comfystream import tensor_cache
 
-class SaveASRResponse:
+class SaveResult:
     CATEGORY = "audio_utils"
     RETURN_TYPES = ()
     FUNCTION = "execute"
@@ -10,15 +10,15 @@ class SaveASRResponse:
     def INPUT_TYPES(s):
         return {
             "required": {
-                "data": ("DICT",),
+                "result": ("RESULT",),
             }
         }
 
     @classmethod
     def IS_CHANGED(s):
         return float("nan")
 
-    def execute(self, data: dict):
+    def execute(self, result):
         fut = tensor_cache.audio_outputs.pop()
-        fut.set_result(data)
-        return data
+        fut.set_result(result)
+        return result
diff --git a/nodes/whisper_utils/__init__.py b/nodes/whisper_utils/__init__.py
@@ -0,0 +1,5 @@
+from .apply_whisper import ApplyWhisper
+
+NODE_CLASS_MAPPINGS = {"ApplyWhisper": ApplyWhisper}
+
+__all__ = ["NODE_CLASS_MAPPINGS"]
diff --git a/nodes/whisper_utils/apply_whisper.py b/nodes/whisper_utils/apply_whisper.py
@@ -0,0 +1,37 @@
+from .whisper_online import FasterWhisperASR, VACOnlineASRProcessor
+
+class ApplyWhisper:
+    # Node class that pushes each incoming audio chunk into a streaming
+    # Whisper processor and returns whatever the processor emits for it.
+    @classmethod
+    def INPUT_TYPES(s):
+        # Declares one required input named "audio" of type "AUDIO".
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+            }
+        }
+
+    CATEGORY = "whisper_utils"
+    RETURN_TYPES = ("RESULT",)
+    FUNCTION = "apply_whisper"
+
+    def __init__(self):
+        # Build the ASR backend once per node instance.
+        # NOTE(review): presumably this loads the "large-v3" model weights at
+        # construction time — confirm against whisper_online.
+        self.asr = FasterWhisperASR(
+            lan="en", 
+            modelsize="large-v3", 
+            cache_dir=None, 
+            model_dir=None, 
+            logfile=None
+        )
+        # Presumably enables the backend's voice-activity filtering — verify in whisper_online.
+        self.asr.use_vad()
+
+        # Streaming processor that wraps the ASR backend.
+        # NOTE(review): online_chunk_size=0.5 is presumably seconds and
+        # buffer_trimming=("segment", 15) presumably a 15 s segment-based
+        # trim — confirm against whisper_online.
+        self.online = VACOnlineASRProcessor(
+            online_chunk_size=0.5,
+            asr=self.asr,
+            tokenizer=None,
+            logfile=None,
+            buffer_trimming=("segment", 15)
+        )
+
+    def apply_whisper(self, audio):
+        # Feed the new chunk to the processor, run one processing step, and
+        # return its result as a 1-tuple to match RETURN_TYPES.
+        self.online.insert_audio_chunk(audio)
+        result = self.online.process_iter()
+        return (result,)
diff --git a/nodes/whisper_utils/silero_vad_iterator.py b/nodes/whisper_utils/silero_vad_iterator.py
@@ -0,0 +1,146 @@
+import torch
+
+# This is copied from silero-vad's utils_vad.py:
+# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
+# (except changed defaults)
+
+# Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
+
+class VADIterator:
+    # Streaming wrapper around a VAD model: each call scores one audio chunk
+    # and reports when a speech segment starts or ends.
+    def __init__(self,
+                 model,
+                 threshold: float = 0.5,
+                 sampling_rate: int = 16000,
+                 min_silence_duration_ms: int = 500,  # makes sense on one recording that I checked
+                 speech_pad_ms: int = 100             # same
+                 ):
+
+        """
+        Class for stream imitation
+
+        Parameters
+        ----------
+        model: preloaded .jit silero VAD model
+
+        threshold: float (default - 0.5)
+            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities AT OR ABOVE this value are considered as SPEECH.
+            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+
+        sampling_rate: int (default - 16000)
+            Currently silero VAD models support 8000 and 16000 sample rates
+
+        min_silence_duration_ms: int (default - 500 milliseconds)
+            At the end of each speech chunk wait for min_silence_duration_ms before separating it
+
+        speech_pad_ms: int (default - 100 milliseconds)
+            Final speech chunks are padded by speech_pad_ms each side
+
+        Raises
+        ------
+        ValueError
+            If sampling_rate is neither 8000 nor 16000.
+        """
+
+        self.model = model
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+
+        if sampling_rate not in [8000, 16000]:
+            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
+
+        # Convert the millisecond settings to sample counts (true division, so these are floats).
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.reset_states()
+
+    def reset_states(self):
+        # Calls the model's reset_states() and restarts segment tracking from sample 0.
+
+        self.model.reset_states()
+        self.triggered = False   # True while inside a detected speech segment
+        self.temp_end = 0        # sample index where the pending silence began; 0 means none pending
+        self.current_sample = 0  # running count of samples consumed so far
+
+    def __call__(self, x, return_seconds=False):
+        """
+        x: torch.Tensor
+            audio chunk (see examples in repo); non-tensor input is converted with torch.Tensor(x)
+
+        return_seconds: bool (default - False)
+            whether return timestamps in seconds, rounded to one decimal (default - samples)
+
+        Returns
+            {'start': t} when a chunk scores at or above threshold while not in speech;
+            {'end': t} when, while in speech, a chunk scores below threshold - 0.15 and at
+            least min_silence_samples have passed since the first such chunk, with no chunk
+            at or above threshold in between;
+            None otherwise.
+
+        Raises
+            TypeError if x cannot be converted to a tensor.
+        """
+
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit — consider narrowing.
+            except:
+                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
+
+        # For 2-D input the window length is taken from the first row
+        # (presumably (batch, samples) — confirm with callers).
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        # Advance the stream position first, so the timestamps below are
+        # measured from the END of the current chunk.
+        self.current_sample += window_size_samples
+
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        # Speech resumed before the silence timeout expired: cancel the pending end.
+        if (speech_prob >= self.threshold) and self.temp_end:
+            self.temp_end = 0
+
+        # Speech onset: report the start, moved earlier by the padding.
+        # NOTE(review): speech_start is negative while current_sample < speech_pad_samples.
+        if (speech_prob >= self.threshold) and not self.triggered:
+            self.triggered = True
+            speech_start = self.current_sample - self.speech_pad_samples
+            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
+
+        # Leaving speech uses a lower bar (threshold - 0.15) than entering it;
+        # probabilities between the two bars change nothing.
+        if (speech_prob < self.threshold - 0.15) and self.triggered:
+            if not self.temp_end:
+                # First low-probability chunk: remember where the silence began.
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                # Silence is still too short to close the segment.
+                return None
+            else:
+                # Silence lasted long enough: close the segment at the point
+                # the silence began, moved later by the padding.
+                speech_end = self.temp_end + self.speech_pad_samples
+                self.temp_end = 0
+                self.triggered = False
+                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
+
+        return None
+
+#######################
+# because Silero now requires exactly 512-sized audio chunks 
+
+import numpy as np
+class FixedVADIterator(VADIterator):
+    '''Wraps VADIterator so that audio of any length can be passed in, not only exactly 512 samples at a time.
+    Input is buffered and fed to the parent in 512-sample windows; a remainder shorter than 512 samples
+    stays in the buffer until a later call completes the window.
+    If the audio processed in one call is long and multiple voiced segments are detected,
+    then __call__ returns the start of the first segment, and the end of the last segment
+    (or no 'end' key at all when the last segment is still open).
+    '''
+
+    def reset_states(self):
+        # Also invoked from VADIterator.__init__, so the buffer exists right after construction.
+        super().reset_states()
+        self.buffer = np.array([],dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        # Returns a dict with 'start' and/or 'end', or None when no event is
+        # left to report after merging (see below).
+        # np.append without an axis flattens its inputs, so x is treated as 1-D audio.
+        self.buffer = np.append(self.buffer, x) 
+        ret = None
+        # NOTE(review): the window is fixed at 512 samples regardless of
+        # self.sampling_rate — confirm this is also valid for 8000 Hz.
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                # First event (or still None) seen in this call.
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        # An 'end' cancelled by a following 'start' can leave an empty dict; report that as None.
+        return ret if ret != {} else None
+
+if __name__ == "__main__":
+    # Test/demonstrate the need for FixedVADIterator.
+
+    import torch
+    # Load the Silero VAD model through torch.hub.
+    model, _ = torch.hub.load(
+        repo_or_dir='snakers4/silero-vad',
+        model='silero_vad'
+    )
+    vac = FixedVADIterator(model)
+#   vac = VADIterator(model)  # the second case below crashes with this one
+
+    # Exactly 512 samples: works with both iterators.
+    audio_buffer = np.array([0]*(512),dtype=np.float32)
+    vac(audio_buffer)
+
+    # 511 samples: with the plain VADIterator this crashes with
+    # ops.prim.RaiseException("Input audio chunk is too short", "builtins.ValueError")
+    audio_buffer = np.array([0]*(512-1),dtype=np.float32)
+    vac(audio_buffer)