@@ -1,16 +1,12 @@
-import torch
 import av
+import torch
 import numpy as np
-import fractions
 import asyncio
 
-from av import AudioFrame
-from typing import Any, Dict, Optional, Union, List
+from typing import Any, Dict, Union, List
 
 from comfystream.client import ComfyStreamClient
-from comfystream import tensor_cache
-
-WARMUP_RUNS = 5
 
+WARMUP_RUNS = 10
 
 
 class Pipeline:
@@ -19,53 +15,39 @@ def __init__(self, **kwargs):
 
         self.video_futures = asyncio.Queue()
         self.audio_futures = asyncio.Queue()
-
-        self.audio_output_frames = []
 
         self.resampler = av.audio.resampler.AudioResampler(format='s16', layout='mono', rate=48000)  # find a better way to convert to mono
-        self.sample_rate = 48000  # instead of hardcoding, find a clean way to set from audio frame
-        self.frame_size = int(self.sample_rate * 0.02)
-        self.time_base = fractions.Fraction(1, self.sample_rate)
-        self.curr_pts = 0  # figure out a better way to set back pts to processed audio frames
-
-    def set_prompt(self, prompt: Dict[Any, Any]):
-        self.client.set_prompt(prompt)
 
     async def warm(self):
-        dummy_video_frame = torch.randn(1, 512, 512, 3)
-        dummy_audio_frame = np.random.randint(-32768, 32767, 48000 * 1, dtype=np.int16)
+        dummy_video_inp = torch.randn(1, 512, 512, 3)
+        dummy_audio_inp = np.random.randint(-32768, 32767, 48 * 20, dtype=np.int16)  # has to be more than the buffer size in the comfy workflow
 
         for _ in range(WARMUP_RUNS):
-            image_out_fut = asyncio.Future()
-            audio_out_fut = asyncio.Future()
-            tensor_cache.image_outputs.put(image_out_fut)
-            tensor_cache.audio_outputs.put(audio_out_fut)
+            image_out_fut = self.client.put_video_input(dummy_video_inp)
+            await image_out_fut
 
-            tensor_cache.image_inputs.put(dummy_video_frame)
-            tensor_cache.audio_inputs.put(dummy_audio_frame)
+        futs = []
+        for _ in range(WARMUP_RUNS):
+            audio_out_fut = self.client.put_audio_input(dummy_audio_inp)
+            futs.append(audio_out_fut)
 
-            await image_out_fut
-            await audio_out_fut
+        await asyncio.gather(*futs)
 
-    def set_prompts(self, prompts: Union[Dict[Any, Any], List[Dict[Any, Any]]]):
+    async def set_prompts(self, prompts: Union[Dict[Any, Any], List[Dict[Any, Any]]]):
         if isinstance(prompts, dict):
-            self.client.set_prompts([prompts])
+            await self.client.set_prompts([prompts])
         else:
-            self.client.set_prompts(prompts)
+            await self.client.set_prompts(prompts)
 
     async def put_video_frame(self, frame: av.VideoFrame):
         inp_tensor = self.video_preprocess(frame)
-        out_future = asyncio.Future()
-        tensor_cache.image_outputs.put(out_future)
-        tensor_cache.image_inputs.put(inp_tensor)
+        out_future = self.client.put_video_input(inp_tensor)
         await self.video_futures.put((out_future, frame.pts, frame.time_base))
 
     async def put_audio_frame(self, frame: av.AudioFrame):
         inp_tensor = self.audio_preprocess(frame)
-        out_future = asyncio.Future()
-        tensor_cache.audio_outputs.put(out_future)
-        tensor_cache.audio_inputs.put(inp_tensor)
-        await self.audio_futures.put(out_future)
+        out_future = self.client.put_audio_input(inp_tensor)
+        await self.audio_futures.put((out_future, frame.pts, frame.time_base, frame.sample_rate))
 
     def video_preprocess(self, frame: av.VideoFrame) -> torch.Tensor:
         frame_np = frame.to_ndarray(format="rgb24").astype(np.float32) / 255.0
@@ -80,18 +62,7 @@ def video_postprocess(self, output: torch.Tensor) -> av.VideoFrame:
         )
 
     def audio_postprocess(self, output: torch.Tensor) -> av.AudioFrame:
-        frames = []
-        for idx in range(0, len(output), self.frame_size):
-            frame_samples = output[idx:idx + self.frame_size]
-            frame_samples = frame_samples.reshape(1, -1).astype(np.int16)
-            frame = AudioFrame.from_ndarray(frame_samples, layout="mono")
-            frame.sample_rate = self.sample_rate
-            frame.pts = self.curr_pts
-            frame.time_base = self.time_base
-            self.curr_pts += 960
-
-            frames.append(frame)
-        return frames
+        return av.AudioFrame.from_ndarray(output.reshape(1, -1), layout="mono")
 
     async def get_processed_video_frame(self):
         out_fut, pts, time_base = await self.video_futures.get()
@@ -101,14 +72,12 @@ async def get_processed_video_frame(self):
         return frame
 
     async def get_processed_audio_frame(self):
-        while not self.audio_output_frames:
-            out_fut = await self.audio_futures.get()
-            output = await out_fut
-            if output is None:
-                print("No Audio output")
-                continue
-            self.audio_output_frames.extend(self.audio_postprocess(output))
-        return self.audio_output_frames.pop(0)
+        out_fut, pts, time_base, sample_rate = await self.audio_futures.get()
+        frame = self.audio_postprocess(await out_fut)
+        frame.pts = pts
+        frame.time_base = time_base
+        frame.sample_rate = sample_rate
+        return frame
 
     async def get_nodes_info(self) -> Dict[str, Any]:
         """Get information about all nodes in the current prompt including metadata."""