|
11 | 11 |
|
12 | 12 | class Pipeline:
|
13 | 13 | def __init__(self, **kwargs):
|
14 |
| - self.client = ComfyStreamClient(**kwargs, max_workers=5) # hardcoded max workers |
| 14 | + self.client = ComfyStreamClient(**kwargs, max_workers=5) # TODO: hardcoded max workers, should it be configurable? |
15 | 15 |
|
16 | 16 | self.video_futures = asyncio.Queue()
|
17 | 17 | self.audio_futures = asyncio.Queue()
|
18 |
| - |
19 |
| - self.resampler = av.audio.resampler.AudioResampler(format='s16', layout='mono', rate=48000) # find a better way to convert to mono |
20 | 18 |
|
21 | 19 | async def warm(self):
|
22 | 20 | dummy_video_inp = torch.randn(1, 512, 512, 3)
|
23 |
| - dummy_audio_inp = np.random.randint(-32768, 32767, 48 * 20, dtype=np.int16) # has to be more than the buffer size in comfy workflow |
| 21 | + dummy_audio_inp = np.random.randint(-32768, 32767, 48 * 20, dtype=np.int16) # TODO: might affect the workflow, due to buffering |
24 | 22 |
|
25 | 23 | for _ in range(WARMUP_RUNS):
|
26 | 24 | image_out_fut = self.client.put_video_input(dummy_video_inp)
|
@@ -54,7 +52,7 @@ def video_preprocess(self, frame: av.VideoFrame) -> torch.Tensor:
|
54 | 52 | return torch.from_numpy(frame_np).unsqueeze(0)
|
55 | 53 |
|
56 | 54 | def audio_preprocess(self, frame: av.AudioFrame) -> torch.Tensor:
|
57 |
| - return self.resampler.resample(frame)[0].to_ndarray().flatten() |
| 55 | + return frame.to_ndarray().ravel().reshape(-1, 2).mean(axis=1).astype(np.int16) |
58 | 56 |
|
59 | 57 | def video_postprocess(self, output: torch.Tensor) -> av.VideoFrame:
|
60 | 58 | return av.VideoFrame.from_ndarray(
|
|
0 commit comments