diff --git a/app/client/api.ts b/app/client/api.ts
index 9f2cb23053e..9d9977deb24 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -64,16 +64,6 @@ export interface SpeechOptions {
   onController?: (controller: AbortController) => void;
 }
 
-export interface TranscriptionOptions {
-  model?: "whisper-1";
-  file: Blob;
-  language?: string;
-  prompt?: string;
-  response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
-  temperature?: number;
-  onController?: (controller: AbortController) => void;
-}
-
 export interface ChatOptions {
   messages: RequestMessage[];
   config: LLMConfig;
@@ -109,7 +99,6 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
   abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
-  abstract transcription(options: TranscriptionOptions): Promise<string>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
index e839c69f01f..4ade9ebb98f 100644
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -13,7 +13,6 @@ import {
   LLMApi,
   LLMModel,
   SpeechOptions,
-  TranscriptionOptions,
   MultimodalContent,
 } from "../api";
 import Locale from "../../locales";
@@ -88,9 +87,6 @@ export class QwenApi implements LLMApi {
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
-  transcription(options: TranscriptionOptions): Promise<string> {
-    throw new Error("Method not implemented.");
-  }
 
   async chat(options: ChatOptions) {
     const messages = options.messages.map((v) => ({
diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts
index 2ab67ed1371..e624a2e16cf 100644
--- a/app/client/platforms/anthropic.ts
+++ b/app/client/platforms/anthropic.ts
@@ -5,7 +5,6 @@ import {
   LLMApi,
   MultimodalContent,
   SpeechOptions,
-  TranscriptionOptions,
 } from "../api";
 import {
   useAccessStore,
@@ -90,9 +89,6 @@ export class ClaudeApi implements LLMApi {
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
-  transcription(options: TranscriptionOptions): Promise<string> {
-    throw new Error("Method not implemented.");
-  }
 
   extractMessage(res: any) {
     console.log("[Response] claude response: ", res);
diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts
index 0c2be5fb14b..c360417c602 100644
--- a/app/client/platforms/baidu.ts
+++ b/app/client/platforms/baidu.ts
@@ -15,7 +15,6 @@ import {
   LLMModel,
   MultimodalContent,
   SpeechOptions,
-  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -80,9 +79,6 @@ export class ErnieApi implements LLMApi {
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
-  transcription(options: TranscriptionOptions): Promise<string> {
-    throw new Error("Method not implemented.");
-  }
 
   async chat(options: ChatOptions) {
     const messages = options.messages.map((v) => ({
diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts
index 5a0c9b8b12e..a6e2d426ee3 100644
--- a/app/client/platforms/bytedance.ts
+++ b/app/client/platforms/bytedance.ts
@@ -14,7 +14,6 @@ import {
   LLMModel,
   MultimodalContent,
   SpeechOptions,
-  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -82,9 +81,6 @@ export class DoubaoApi implements LLMApi {
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
-  transcription(options: TranscriptionOptions): Promise<string> {
-    throw new Error("Method not implemented.");
-  }
 
   async chat(options: ChatOptions) {
     const messages = options.messages.map((v) => ({
diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts
index c8d3658b350..ecb5ce44b57 100644
--- a/app/client/platforms/google.ts
+++ b/app/client/platforms/google.ts
@@ -6,7 +6,6 @@ import {
   LLMModel,
   LLMUsage,
   SpeechOptions,
-  TranscriptionOptions,
 } from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import { getClientConfig } from "@/app/config/client";
@@ -67,9 +66,7 @@ export class GeminiProApi implements LLMApi {
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
-  transcription(options: TranscriptionOptions): Promise<string> {
-    throw new Error("Method not implemented.");
-  }
+
   async chat(options: ChatOptions): Promise<void> {
     const apiClient = this;
     let multimodal = false;
diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts
index 6463e052e40..bd0c6083809 100644
--- a/app/client/platforms/iflytek.ts
+++ b/app/client/platforms/iflytek.ts
@@ -13,7 +13,6 @@ import {
   LLMApi,
   LLMModel,
   SpeechOptions,
-  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -63,9 +62,6 @@ export class SparkApi implements LLMApi {
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
-  transcription(options: TranscriptionOptions): Promise<string> {
-    throw new Error("Method not implemented.");
-  }
 
   async chat(options: ChatOptions) {
     const messages: ChatOptions["messages"] = [];
diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts
index 173ecd14c9d..bf8f18751d2 100644
--- a/app/client/platforms/moonshot.ts
+++ b/app/client/platforms/moonshot.ts
@@ -27,7 +27,6 @@ import {
   LLMUsage,
   MultimodalContent,
   SpeechOptions,
-  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -77,9 +76,6 @@ export class MoonshotApi implements LLMApi {
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
-  transcription(options: TranscriptionOptions): Promise<string> {
-    throw new Error("Method not implemented.");
-  }
 
   async chat(options: ChatOptions) {
     const messages: ChatOptions["messages"] = [];
diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts
index 71b7731fa0b..70b191f5c1d 100644
--- a/app/client/platforms/openai.ts
+++ b/app/client/platforms/openai.ts
@@ -34,7 +34,6 @@ import {
   LLMUsage,
   MultimodalContent,
   SpeechOptions,
-  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -187,47 +186,6 @@ export class ChatGPTApi implements LLMApi {
     }
   }
 
-  async transcription(options: TranscriptionOptions): Promise<string> {
-    const formData = new FormData();
-    formData.append("file", options.file, "audio.wav");
-    formData.append("model", options.model ?? "whisper-1");
"whisper-1"); - if (options.language) formData.append("language", options.language); - if (options.prompt) formData.append("prompt", options.prompt); - if (options.response_format) - formData.append("response_format", options.response_format); - if (options.temperature) - formData.append("temperature", options.temperature.toString()); - - console.log("[Request] openai audio transcriptions payload: ", options); - - const controller = new AbortController(); - options.onController?.(controller); - - try { - const path = this.path(OpenaiPath.TranscriptionPath, options.model); - const headers = getHeaders(true); - const payload = { - method: "POST", - body: formData, - signal: controller.signal, - headers: headers, - }; - - // make a fetch request - const requestTimeoutId = setTimeout( - () => controller.abort(), - REQUEST_TIMEOUT_MS, - ); - const res = await fetch(path, payload); - clearTimeout(requestTimeoutId); - const json = await res.json(); - return json.text; - } catch (e) { - console.log("[Request] failed to make a audio transcriptions request", e); - throw e; - } - } - async chat(options: ChatOptions) { const modelConfig = { ...useAppConfig.getState().modelConfig, diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts index 1739b7a142b..3e8f1a45957 100644 --- a/app/client/platforms/tencent.ts +++ b/app/client/platforms/tencent.ts @@ -9,7 +9,6 @@ import { LLMModel, MultimodalContent, SpeechOptions, - TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -94,9 +93,6 @@ export class HunyuanApi implements LLMApi { speech(options: SpeechOptions): Promise { throw new Error("Method not implemented."); } - transcription(options: TranscriptionOptions): Promise { - throw new Error("Method not implemented."); - } async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); diff --git a/app/components/chat.tsx b/app/components/chat.tsx index cb03440775e..59ffa01c1ce 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -10,7 +10,6 @@ import React, { } from "react"; import SendWhiteIcon from "../icons/send-white.svg"; -import VoiceWhiteIcon from "../icons/voice-white.svg"; import BrainIcon from "../icons/brain.svg"; import RenameIcon from "../icons/rename.svg"; import ExportIcon from "../icons/share.svg"; @@ -83,7 +82,7 @@ import dynamic from "next/dynamic"; import { ChatControllerPool } from "../client/controller"; import { DalleSize, DalleQuality, DalleStyle } from "../typing"; import { Prompt, usePromptStore } from "../store/prompt"; -import Locale, { getLang, getSTTLang } from "../locales"; +import Locale from "../locales"; import { IconButton } from "./button"; import styles from "./chat.module.scss"; @@ -100,9 +99,7 @@ import { import { useNavigate } from "react-router-dom"; import { CHAT_PAGE_SIZE, - DEFAULT_STT_ENGINE, DEFAULT_TTS_ENGINE, - FIREFOX_DEFAULT_STT_ENGINE, ModelProvider, LAST_INPUT_KEY, Path, @@ -123,11 +120,6 @@ import { MultimodalContent } from "../client/api"; const localStorage = safeLocalStorage(); import { ClientApi } from "../client/api"; import { createTTSPlayer } from "../utils/audio"; -import { - OpenAITranscriptionApi, - SpeechApi, - WebTranscriptionApi, -} from "../utils/speech"; import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; const ttsPlayer = createTTSPlayer(); @@ -556,44 +548,6 @@ export function ChatActions(props: { } }, [chatStore, currentModel, models]); - const [isListening, setIsListening] = useState(false); - const [isTranscription, 
-  const [speechApi, setSpeechApi] = useState<any>(null);
-
-  useEffect(() => {
-    if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE;
-    setSpeechApi(
-      config.sttConfig.engine === DEFAULT_STT_ENGINE
-        ? new WebTranscriptionApi((transcription) =>
-            onRecognitionEnd(transcription),
-          )
-        : new OpenAITranscriptionApi((transcription) =>
-            onRecognitionEnd(transcription),
-          ),
-    );
-  }, []);
-
-  const startListening = async () => {
-    if (speechApi) {
-      await speechApi.start();
-      setIsListening(true);
-    }
-  };
-  const stopListening = async () => {
-    if (speechApi) {
-      if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
-        setIsTranscription(true);
-      await speechApi.stop();
-      setIsListening(false);
-    }
-  };
-  const onRecognitionEnd = (finalTranscript: string) => {
-    console.log(finalTranscript);
-    if (finalTranscript) props.setUserInput(finalTranscript);
-    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
-      setIsTranscription(false);
-  };
-
   return (
     <div className={styles["chat-input-actions"]}>
       {couldStop && (
@@ -828,16 +782,6 @@ export function ChatActions(props: {
           icon={<ShortcutkeyIcon />}
         />
       )}
-
-      {config.sttConfig.enable && (
-        <ChatAction
-          onClick={async () =>
-            isListening ? await stopListening() : await startListening()
-          }
-          text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak}
-          icon={<VoiceWhiteIcon />}
-        />
-      )}
     </div>
   );
 }
diff --git a/app/components/settings.tsx b/app/components/settings.tsx
index 47a72d79de7..33bf8b2e74e 100644
--- a/app/components/settings.tsx
+++ b/app/components/settings.tsx
@@ -81,7 +81,6 @@ import { nanoid } from "nanoid";
 import { useMaskStore } from "../store/mask";
 import { ProviderType } from "../utils/cloud";
 import { TTSConfigList } from "./tts-config";
-import { STTConfigList } from "./stt-config";
 
 function EditPromptModal(props: { id: string; onClose: () => void }) {
   const promptStore = usePromptStore();
@@ -1659,17 +1658,6 @@ export function Settings() {
           />
         </List>
 
-        <List>
-          <STTConfigList
-            sttConfig={config.sttConfig}
-            updateConfig={(updater) => {
-              const sttConfig = { ...config.sttConfig };
-              updater(sttConfig);
-              config.update((config) => (config.sttConfig = sttConfig));
-            }}
-          />
-        </List>
-
         <DangerItems />
       </div>
     </ErrorBoundary>
diff --git a/app/components/stt-config.tsx b/app/components/stt-config.tsx
deleted file mode 100644
index f83d280305f..00000000000
--- a/app/components/stt-config.tsx
+++ /dev/null
@@ -1,51 +0,0 @@
-import { STTConfig, STTConfigValidator } from "../store";
-
-import Locale from "../locales";
-import { ListItem, Select } from "./ui-lib";
-import { DEFAULT_STT_ENGINES } from "../constant";
-import { isFirefox } from "../utils";
-
-export function STTConfigList(props: {
-  sttConfig: STTConfig;
-  updateConfig: (updater: (config: STTConfig) => void) => void;
-}) {
-  return (
-    <>
-      <ListItem
-        title={Locale.Settings.STT.Enable.Title}
-        subTitle={Locale.Settings.STT.Enable.SubTitle}
-      >
-        <input
-          type="checkbox"
-          checked={props.sttConfig.enable}
-          onChange={(e) =>
-            props.updateConfig(
-              (config) => (config.enable = e.currentTarget.checked),
-            )
-          }
-        ></input>
-      </ListItem>
-      {!isFirefox() && (
-        <ListItem
-          title={Locale.Settings.STT.Engine.Title}
-          subTitle={Locale.Settings.STT.Engine.SubTitle}
-        >
-          <Select
-            value={props.sttConfig.engine}
-            onChange={(e) => {
-              props.updateConfig(
-                (config) => (config.engine = STTConfigValidator.engine(e.currentTarget.value)),
-              );
-            }}
-          >
-            {DEFAULT_STT_ENGINES.map((v, i) => (
-              <option value={v} key={i}>
-                {v}
-              </option>
-            ))}
-          </Select>
-        </ListItem>
-      )}
-    </>
-  );
-}
diff --git a/app/components/stt.module.scss b/app/components/stt.module.scss
deleted file mode 100644
index ba9f382e40b..00000000000
--- a/app/components/stt.module.scss
+++ /dev/null
@@ -1,119 +0,0 @@
-@import "../styles/animation.scss";
-.plugin-page {
-  height: 100%;
-  display: flex;
-  flex-direction: column;
-
-  .plugin-page-body {
-    padding: 20px;
-    overflow-y: auto;
-
-    .plugin-filter {
-      width: 100%;
-      max-width: 100%;
-      margin-bottom: 20px;
-      animation: slide-in ease 0.3s;
-      height: 40px;
-
-      display: flex;
-
-      .search-bar {
-        flex-grow: 1;
-        max-width: 100%;
-        min-width: 0;
-        outline: none;
-      }
-
-      .search-bar:focus {
-        border: 1px solid var(--primary);
-      }
-
-      .plugin-filter-lang {
-        height: 100%;
-        margin-left: 10px;
-      }
-
-      .plugin-create {
-        height: 100%;
-        margin-left: 10px;
-        box-sizing: border-box;
-        min-width: 80px;
-      }
-    }
-
-    .plugin-item {
-      display: flex;
-      justify-content: space-between;
-      padding: 20px;
-      border: var(--border-in-light);
-      animation: slide-in ease 0.3s;
-
-      &:not(:last-child) {
-        border-bottom: 0;
-      }
-
-      &:first-child {
-        border-top-left-radius: 10px;
-        border-top-right-radius: 10px;
-      }
-
-      &:last-child {
-        border-bottom-left-radius: 10px;
-        border-bottom-right-radius: 10px;
-      }
-
-      .plugin-header {
-        display: flex;
-        align-items: center;
-
-        .plugin-icon {
-          display: flex;
-          align-items: center;
-          justify-content: center;
-          margin-right: 10px;
-        }
-
-        .plugin-title {
-          .plugin-name {
-            font-size: 14px;
-            font-weight: bold;
-          }
-          .plugin-info {
-            font-size: 12px;
-          }
-          .plugin-runtime-warning {
-            font-size: 12px;
-            color: #f86c6c;
-          }
-        }
-      }
-
-      .plugin-actions {
-        display: flex;
-        flex-wrap: nowrap;
-        transition: all ease 0.3s;
-        justify-content: center;
-        align-items: center;
-      }
-
-      @media screen and (max-width: 600px) {
-        display: flex;
-        flex-direction: column;
-        padding-bottom: 10px;
-        border-radius: 10px;
-        margin-bottom: 20px;
-        box-shadow: var(--card-shadow);
-
-        &:not(:last-child) {
-          border-bottom: var(--border-in-light);
-        }
-
-        .plugin-actions {
-          width: 100%;
-          justify-content: space-between;
-          padding-top: 10px;
-        }
-      }
-    }
-  }
-}
diff --git a/app/constant.ts b/app/constant.ts
index d349268baad..7d7fcf78b7c 100644
--- a/app/constant.ts
+++ b/app/constant.ts
@@ -153,7 +153,6 @@ export const Anthropic = {
 export const OpenaiPath = {
   ChatPath: "v1/chat/completions",
   SpeechPath: "v1/audio/speech",
-  TranscriptionPath: "v1/audio/transcriptions",
   ImagePath: "v1/images/generations",
   UsagePath: "dashboard/billing/usage",
   SubsPath: "dashboard/billing/subscription",
@@ -274,10 +273,6 @@ export const DEFAULT_TTS_VOICES = [
   "shimmer",
 ];
 
-export const DEFAULT_STT_ENGINE = "WebAPI";
-export const DEFAULT_STT_ENGINES = ["WebAPI", "OpenAI Whisper"];
-export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper";
-
 const openaiModels = [
   "gpt-3.5-turbo",
   "gpt-3.5-turbo-1106",
diff --git a/app/locales/cn.ts b/app/locales/cn.ts
index 05e33049199..979485a0029 100644
--- a/app/locales/cn.ts
+++ b/app/locales/cn.ts
@@ -520,16 +520,6 @@ const cn = {
         SubTitle: "生成语音的速度",
       },
     },
-    STT: {
-      Enable: {
-        Title: "启用语音转文本",
-        SubTitle: "启用语音转文本",
-      },
-      Engine: {
-        Title: "转换引擎",
-        SubTitle: "音频转换引擎",
-      },
-    },
   },
   Store: {
     DefaultTopic: "新的聊天",
diff --git a/app/locales/en.ts b/app/locales/en.ts
index 0c2d2d27d5a..4bf065033eb 100644
--- a/app/locales/en.ts
+++ b/app/locales/en.ts
@@ -527,16 +527,6 @@ const en: LocaleType = {
     },
     Engine: "TTS Engine",
   },
-    STT: {
-      Enable: {
-        Title: "Enable STT",
-        SubTitle: "Enable Speech-to-Text",
-      },
-      Engine: {
-        Title: "STT Engine",
-        SubTitle: "Text-to-Speech Engine",
-      },
-    },
   },
   Store: {
     DefaultTopic: "New Conversation",
diff --git a/app/store/config.ts b/app/store/config.ts
index e2a2f874733..39268c69b8f 100644
--- a/app/store/config.ts
+++ b/app/store/config.ts
@@ -5,8 +5,6 @@ import {
   DEFAULT_INPUT_TEMPLATE,
   DEFAULT_MODELS,
   DEFAULT_SIDEBAR_WIDTH,
-  DEFAULT_STT_ENGINE,
-  DEFAULT_STT_ENGINES,
   DEFAULT_TTS_ENGINE,
   DEFAULT_TTS_ENGINES,
   DEFAULT_TTS_MODEL,
@@ -23,8 +21,6 @@ export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
 export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
 export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];
 
-export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];
-
 export enum SubmitKey {
   Enter = "Enter",
   CtrlEnter = "Ctrl + Enter",
@@ -90,17 +86,12 @@ export const DEFAULT_CONFIG = {
     voice: DEFAULT_TTS_VOICE,
     speed: 1.0,
   },
-  sttConfig: {
-    enable: false,
-    engine: DEFAULT_STT_ENGINE,
-  },
 };
 
 export type ChatConfig = typeof DEFAULT_CONFIG;
 
 export type ModelConfig = ChatConfig["modelConfig"];
 export type TTSConfig = ChatConfig["ttsConfig"];
-export type STTConfig = ChatConfig["sttConfig"];
 
 export function limitNumber(
   x: number,
@@ -130,12 +121,6 @@ export const TTSConfigValidator = {
   },
 };
 
-export const STTConfigValidator = {
-  engine(x: string) {
-    return x as STTEngineType;
-  },
-};
-
 export const ModalConfigValidator = {
   model(x: string) {
     return x as ModelType;
diff --git a/app/utils/speech.ts b/app/utils/speech.ts
deleted file mode 100644
index dc8102879fb..00000000000
--- a/app/utils/speech.ts
+++ /dev/null
@@ -1,126 +0,0 @@
-import { ChatGPTApi } from "../client/platforms/openai";
-import { getSTTLang } from "../locales";
-import { isFirefox } from "../utils";
-
-export type TranscriptionCallback = (transcription: string) => void;
-
-export abstract class SpeechApi {
-  protected onTranscription: TranscriptionCallback = () => {};
-
-  abstract isListening(): boolean;
-  abstract start(): Promise<void>;
-  abstract stop(): Promise<void>;
-
-  onTranscriptionReceived(callback: TranscriptionCallback) {
-    this.onTranscription = callback;
-  }
-}
-
-export class OpenAITranscriptionApi extends SpeechApi {
-  private listeningStatus = false;
-  private mediaRecorder: MediaRecorder | null = null;
-  private stream: MediaStream | null = null;
-  private audioChunks: Blob[] = [];
-
-  isListening = () => this.listeningStatus;
-
-  constructor(transcriptionCallback?: TranscriptionCallback) {
-    super();
-    if (transcriptionCallback) {
-      this.onTranscriptionReceived(transcriptionCallback);
-    }
-  }
-
-  async start(): Promise<void> {
-    // @ts-ignore
-    navigator.getUserMedia =
-      // @ts-ignore
-      navigator.getUserMedia ||
-      // @ts-ignore
-      navigator.webkitGetUserMedia ||
-      // @ts-ignore
-      navigator.mozGetUserMedia ||
-      // @ts-ignore
-      navigator.msGetUserMedia;
-    if (navigator.mediaDevices) {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      this.mediaRecorder = new MediaRecorder(stream);
-      this.mediaRecorder.ondataavailable = (e) => {
-        if (e.data && e.data.size > 0) {
-          this.audioChunks.push(e.data);
-        }
-      };
-
-      this.stream = stream;
-    } else {
-      console.warn("Media Decives will work only with SSL");
-      return;
-    }
-
-    this.audioChunks = [];
-
-    // this.recorder.addEventListener("dataavailable", (event) => {
-    //   this.audioChunks.push(event.data);
-    // });
-
-    this.mediaRecorder.start(1000);
-    this.listeningStatus = true;
-  }
-
-  async stop(): Promise<void> {
-    if (!this.mediaRecorder || !this.listeningStatus) {
-      return;
-    }
-
-    return new Promise((resolve) => {
-      this.mediaRecorder!.addEventListener("stop", async () => {
-        const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
-        const llm = new ChatGPTApi();
-        const transcription = await llm.transcription({ file: audioBlob });
-        this.onTranscription(transcription);
-        this.listeningStatus = false;
-        resolve();
-      });
-
-      this.mediaRecorder!.stop();
-    });
-  }
-}
-
-export class WebTranscriptionApi extends SpeechApi {
-  private listeningStatus = false;
-  private recognitionInstance: any | null = null;
-
-  isListening = () => this.listeningStatus;
-
-  constructor(transcriptionCallback?: TranscriptionCallback) {
-    super();
-    if (isFirefox()) return;
-    const SpeechRecognition =
-      (window as any).SpeechRecognition ||
-      (window as any).webkitSpeechRecognition;
-    this.recognitionInstance = new SpeechRecognition();
-    this.recognitionInstance.continuous = true;
-    this.recognitionInstance.interimResults = true;
-    this.recognitionInstance.lang = getSTTLang();
-    if (transcriptionCallback) {
-      this.onTranscriptionReceived(transcriptionCallback);
-    }
-    this.recognitionInstance.onresult = (event: any) => {
-      const result = event.results[event.results.length - 1];
-      if (result.isFinal) {
-        this.onTranscription(result[0].transcript);
-      }
-    };
-  }
-
-  async start(): Promise<void> {
-    this.listeningStatus = true;
-    await this.recognitionInstance.start();
-  }
-
-  async stop(): Promise<void> {
-    this.listeningStatus = false;
-    await this.recognitionInstance.stop();
-  }
-}