From cf46d5ad63bc13dd4e7f938689fdb3c65a09e929 Mon Sep 17 00:00:00 2001
From: lloydzhou
Date: Thu, 7 Nov 2024 01:12:08 +0800
Subject: [PATCH] upload response audio, and update audio_url to session message

---
 .../realtime-chat/realtime-chat.tsx | 23 +++++++++++---
 app/lib/audio.ts                    | 30 +++++++++++++++++++
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/app/components/realtime-chat/realtime-chat.tsx b/app/components/realtime-chat/realtime-chat.tsx
index a46b19e795d..30181c0fac9 100644
--- a/app/components/realtime-chat/realtime-chat.tsx
+++ b/app/components/realtime-chat/realtime-chat.tsx
@@ -25,6 +25,7 @@ import {
   TurnDetection,
 } from "rt-client";
 import { AudioHandler } from "@/app/lib/audio";
+import { uploadImage } from "@/app/utils/chat";
 
 interface RealtimeChatProps {
   onClose?: () => void;
@@ -126,11 +127,16 @@ export function RealtimeChat({
 
   const handleResponse = async (response: RTResponse) => {
     for await (const item of response) {
+      console.log("handleResponse", item);
       if (item.type === "message" && item.role === "assistant") {
         const botMessage = createMessage({
           role: item.role,
           content: "",
         });
+        // add the bot message to the session first
+        chatStore.updateTargetSession(session, (session) => {
+          session.messages = session.messages.concat([botMessage]);
+        });
         for await (const content of item) {
           if (content.type === "text") {
             for await (const text of content.textChunks()) {
@@ -149,12 +155,18 @@ export function RealtimeChat({
               }
             };
             await Promise.all([textTask(), audioTask()]);
-            chatStore.updateTargetSession(session, (session) => {
-              botMessage.date = new Date().toLocaleString();
-              session.messages = session.messages.concat([botMessage]);
-            });
           }
         }
+        // upload the response audio and get its audio_url
+        const blob = audioHandlerRef.current?.savePlayFile();
+        uploadImage(blob).then((audio_url) => {
+          botMessage.audio_url = audio_url;
+          botMessage.date = new Date().toLocaleString();
+          // update text and audio_url
+          chatStore.updateTargetSession(session, (session) => {
+            session.messages = session.messages.concat();
+          });
+        });
       }
     }
   };
@@ -162,6 +174,9 @@ export function RealtimeChat({
   const handleInputAudio = async (item: RTInputAudioItem) => {
     audioHandlerRef.current?.stopStreamingPlayback();
     await item.waitForCompletion();
+    const { audioStartMillis, audioEndMillis } = item;
+    // TODO: save the input audio_url and update the session
+    console.log("handleInputAudio", item, audioStartMillis, audioEndMillis);
     const userMessage = createMessage({
       role: "user",
       content: item.transcription,
diff --git a/app/lib/audio.ts b/app/lib/audio.ts
index b3674a2c525..15b9a3cb5eb 100644
--- a/app/lib/audio.ts
+++ b/app/lib/audio.ts
@@ -8,6 +8,7 @@ export class AudioHandler {
   private nextPlayTime: number = 0;
   private isPlaying: boolean = false;
   private playbackQueue: AudioBufferSourceNode[] = [];
+  private playBuffer: number[] = [];
 
   constructor() {
     this.context = new AudioContext({ sampleRate: this.sampleRate });
@@ -84,12 +85,14 @@ export class AudioHandler {
     this.isPlaying = false;
     this.playbackQueue.forEach((source) => source.stop());
     this.playbackQueue = [];
+    this.playBuffer = [];
   }
 
   playChunk(chunk: Uint8Array) {
     if (!this.isPlaying) return;
 
     const int16Data = new Int16Array(chunk.buffer);
+    this.playBuffer.push(...int16Data); // save the PCM16 samples for savePlayFile()
 
     const float32Data = new Float32Array(int16Data.length);
     for (let i = 0; i < int16Data.length; i++) {
@@ -125,6 +128,33 @@ export class AudioHandler {
       this.nextPlayTime = this.context.currentTime;
     }
   }
+  _saveData(data: Int16Array, bytesPerSample = 16): Blob {
+    const headerLength = 44;
+    const numberOfChannels = 1;
+    const dataLength = data.length;
+    const wav = new Uint8Array(headerLength + dataLength * 2);
+    const view = new DataView(wav.buffer);
+    view.setUint32(0, 1380533830, false); // RIFF identifier 'RIFF'
+    view.setUint32(4, 36 + dataLength * 2, true); // file length minus RIFF identifier length and file description length
+    view.setUint32(8, 1463899717, false); // RIFF type 'WAVE'
+    view.setUint32(12, 1718449184, false); // format chunk identifier 'fmt '
+    view.setUint32(16, 16, true); // format chunk length
+    view.setUint16(20, 1, true); // audio format (1 = PCM)
+    view.setUint16(22, numberOfChannels, true); // channel count
+    view.setUint32(24, this.sampleRate, true); // sample rate
+    view.setUint32(28, this.sampleRate * 2, true); // byte rate (sample rate * block align)
+    view.setUint16(32, numberOfChannels * 2, true); // block align (channel count * bytes per sample)
+    view.setUint16(34, bytesPerSample, true); // bits per sample
+    view.setUint32(36, 1684108385, false); // data chunk identifier 'data'
+    view.setUint32(40, dataLength * 2, true); // data chunk length
+    for (let i = 0; i < dataLength; i++) {
+      view.setInt16(44 + i * 2, data[i], true);
+    }
+    return new Blob([view], { type: "audio/wav" });
+  }
+  savePlayFile() {
+    return this._saveData(new Int16Array(this.playBuffer));
+  }
   async close() {
     this.workletNode?.disconnect();
     this.source?.disconnect();
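For reference, the _saveData() helper added above prefixes the buffered samples with a standard 44-byte PCM WAV header (mono, 16-bit, little-endian, at the handler's sampleRate). The snippet below is an illustrative sketch only, not part of the patch: describeWavHeader is a hypothetical helper that reads those fields back from the generated Blob, which can be handy when unit-testing savePlayFile().

// Illustrative sketch (not from the patch): decode the fixed 44-byte header
// that _saveData() writes, e.g. to assert on it in a test.
async function describeWavHeader(blob: Blob) {
  const buf = await blob.arrayBuffer();
  const view = new DataView(buf);
  const tag = (offset: number) =>
    String.fromCharCode(...new Uint8Array(buf, offset, 4));
  return {
    riff: tag(0), // "RIFF"
    fileLength: view.getUint32(4, true), // 36 + dataLength * 2
    wave: tag(8), // "WAVE"
    fmt: tag(12), // "fmt "
    fmtLength: view.getUint32(16, true), // 16 for PCM
    audioFormat: view.getUint16(20, true), // 1 = PCM
    channels: view.getUint16(22, true), // 1 (mono)
    sampleRate: view.getUint32(24, true), // AudioHandler's sampleRate
    byteRate: view.getUint32(28, true), // sampleRate * blockAlign
    blockAlign: view.getUint16(32, true), // channels * 2 bytes per sample
    bitsPerSample: view.getUint16(34, true), // 16
    data: tag(36), // "data"
    dataLength: view.getUint32(40, true), // sample count * 2 bytes
  };
}

For mono 16-bit PCM the block align is 2 bytes and the byte rate is sampleRate * 2, which is what the header written above encodes.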
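A rough usage sketch of the new savePlayFile()/uploadImage() flow follows. It assumes, as the diff itself does, that uploadImage() accepts an arbitrary Blob and resolves to a hosted URL; persistAssistantAudio is a hypothetical wrapper, not code from the patch.

import { AudioHandler } from "@/app/lib/audio";
import { uploadImage } from "@/app/utils/chat";

// Hypothetical wrapper (not in the patch): upload whatever audio was buffered
// during streaming playback and return its URL, or undefined if nothing was
// buffered (a bare WAV header is exactly 44 bytes).
async function persistAssistantAudio(
  handler?: AudioHandler,
): Promise<string | undefined> {
  const blob = handler?.savePlayFile();
  if (!blob || blob.size <= 44) return undefined;
  return uploadImage(blob);
}

The resulting URL can then be written to botMessage.audio_url and the session refreshed, as the realtime-chat.tsx hunk does after Promise.all([textTask(), audioTask()]) settles.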