From 3fca11c4f72c3d6e33a2061156396fdf03b0beb2 Mon Sep 17 00:00:00 2001 From: Stephen Hodgson Date: Mon, 25 Nov 2024 01:01:53 -0500 Subject: [PATCH] com.rest.elevenlabs 3.4.0 (#100) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - com.utilities.rest -> 3.3.0 - com.utilities.encoder.ogg -> 4.0.2 - Added additional request properties for TextToSpeechRequest - `previous_text`, `next_text`, `previous_request_ids`, `next_request_ids`, `languageCode`, `withTimestamps` - `cacheFormat` which can be `None`, `Wav`, or `Ogg` - Added support for transcription timestamps by @tomkail - Added support for language code in TextToSpeechRequest @Mylan719 - Refactored `VoiceClip` - clip samples and data are now prioritized over the `AudioClip` - audioClip will not be created until you access the `VoiceClip.AudioClip` property - if an audio clip is not loaded, you can load it with `LoadCachedAudioClipAsync` - Refactored demo scene to use `OnAudioFilterRead` to better quality stream playback --------- Co-authored-by: Milan Mikuš Co-authored-by: Milan Mikuš Co-authored-by: Tom Kail Co-authored-by: Tom Kail --- .github/workflows/unity.yml | 16 +- ElevenLabs/.editorconfig | 3 + .../Documentation~/README.md | 41 +- .../Editor/ElevenLabsDashboard.cs | 6 +- .../Runtime/Common/CacheFormat.cs | 11 + ...Extensions.cs.meta => CacheFormat.cs.meta} | 2 +- .../Runtime/Common/GeneratedClip.cs | 81 ++- .../Runtime/Common/OutputFormatExtensions.cs | 24 - .../Common/TimestampedTranscriptCharacter.cs | 44 ++ .../TimestampedTranscriptCharacter.cs.meta | 11 + .../Runtime/Common/VoiceClip.cs | 12 +- .../Runtime/Dubbing/DubbingEndpoint.cs | 2 - .../Runtime/Extensions/Extensions.cs | 16 + .../Runtime/Extensions/Extensions.cs.meta | 11 + .../Runtime/History/HistoryEndpoint.cs | 17 +- .../Runtime/TextToSpeech/Alignment.cs | 52 ++ .../Runtime/TextToSpeech/Alignment.cs.meta | 11 + .../TextToSpeech/TextToSpeechEndpoint.cs | 599 ++++++++++-------- 
.../TextToSpeech/TextToSpeechRequest.cs | 107 +++- .../TextToSpeech/TranscriptionResponse.cs | 30 + .../TranscriptionResponse.cs.meta | 11 + .../Runtime/Voices/Voice.cs | 2 +- .../Runtime/Voices/VoicesEndpoint.cs | 13 +- .../TextToSpeech/ElevenLabs.Demo.asmdef | 3 +- .../Samples~/TextToSpeech/TextToSpeechDemo.cs | 77 ++- .../TextToSpeech/TextToSpeechDemo.unity | 30 +- .../Test_Fixture_04_TextToSpeechEndpoint.cs | 92 ++- .../Packages/com.rest.elevenlabs/package.json | 8 +- ElevenLabs/Packages/manifest.json | 4 +- ElevenLabs/ProjectSettings/ProjectVersion.txt | 4 +- README.md | 41 +- 31 files changed, 926 insertions(+), 455 deletions(-) create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/CacheFormat.cs rename ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/{OutputFormatExtensions.cs.meta => CacheFormat.cs.meta} (86%) delete mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/OutputFormatExtensions.cs create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs.meta create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs.meta create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs.meta create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs create mode 100644 ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs.meta diff --git a/.github/workflows/unity.yml b/.github/workflows/unity.yml index c8a4036..1872865 100644 --- a/.github/workflows/unity.yml +++ b/.github/workflows/unity.yml @@ -11,26 +11,24 @@ on: workflow_dispatch: concurrency: 
group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ ( github.event_name == 'pull_request' || github.event.action == 'synchronize' ) }} -permissions: - checks: write - pull-requests: write + cancel-in-progress: ${{ (github.event_name == 'pull_request' || github.event.action == 'synchronize') }} jobs: build: - env: - UNITY_PROJECT_PATH: '' + permissions: + checks: write + pull-requests: write runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-13] + os: [ubuntu-latest, windows-latest, macos-15] unity-versions: [2021.x, 2022.x, 6000.x] include: - os: ubuntu-latest build-target: StandaloneLinux64 - os: windows-latest build-target: StandaloneWindows64 - - os: macos-13 + - os: macos-15 build-target: StandaloneOSX steps: - uses: actions/checkout@v4 @@ -46,11 +44,13 @@ jobs: - uses: RageAgainstThePixel/unity-action@v1 name: '${{ matrix.build-target }}-Validate' with: + build-target: ${{ matrix.build-target }} log-name: '${{ matrix.build-target }}-Validate' args: '-quit -nographics -batchmode -executeMethod Utilities.Editor.BuildPipeline.UnityPlayerBuildTools.ValidateProject -importTMProEssentialsAsset' - uses: RageAgainstThePixel/unity-action@v1 name: '${{ matrix.build-target }}-Build' with: + build-target: ${{ matrix.build-target }} log-name: '${{ matrix.build-target }}-Build' args: '-quit -nographics -batchmode -executeMethod Utilities.Editor.BuildPipeline.UnityPlayerBuildTools.StartCommandLineBuild' - uses: actions/upload-artifact@v4 diff --git a/ElevenLabs/.editorconfig b/ElevenLabs/.editorconfig index ac4472f..caa477f 100644 --- a/ElevenLabs/.editorconfig +++ b/ElevenLabs/.editorconfig @@ -34,6 +34,9 @@ dotnet_style_predefined_type_for_locals_parameters_members = true # Code Style csharp_style_var_when_type_is_apparent = true +dotnet_diagnostic.IDE0051.severity = none +dotnet_diagnostic.CS0649.severity = none + #### Resharper/Rider Rules #### # 
https://www.jetbrains.com/help/resharper/EditorConfig_Properties.html diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Documentation~/README.md b/ElevenLabs/Packages/com.rest.elevenlabs/Documentation~/README.md index 75937d6..57ccf1c 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Documentation~/README.md +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Documentation~/README.md @@ -40,6 +40,7 @@ The recommended installation method is though the unity package manager and [Ope - [com.utilities.extensions](https://github.com/RageAgainstThePixel/com.utilities.extensions) - [com.utilities.audio](https://github.com/RageAgainstThePixel/com.utilities.audio) - [com.utilities.encoder.ogg](https://github.com/RageAgainstThePixel/com.utilities.encoder.ogg) + - [com.utilities.encoder.wav](https://github.com/RageAgainstThePixel/com.utilities.encoder.wav) - [com.utilities.rest](https://github.com/RageAgainstThePixel/com.utilities.rest) --- @@ -59,7 +60,7 @@ The recommended installation method is though the unity package manager and [Ope - [Text to Speech](#text-to-speech) - [Stream Text To Speech](#stream-text-to-speech) - [Voices](#voices) - - [Get Shared Voices](#get-shared-voices) :new: + - [Get Shared Voices](#get-shared-voices) - [Get All Voices](#get-all-voices) - [Get Default Voice Settings](#get-default-voice-settings) - [Get Voice](#get-voice) @@ -70,13 +71,13 @@ The recommended installation method is though the unity package manager and [Ope - [Samples](#samples) - [Download Voice Sample](#download-voice-sample) - [Delete Voice Sample](#delete-voice-sample) -- [Dubbing](#dubbing) :new: - - [Dub](#dub) :new: - - [Get Dubbing Metadata](#get-dubbing-metadata) :new: - - [Get Transcript for Dub](#get-transcript-for-dub) :new: - - [Get dubbed file](#get-dubbed-file) :new: - - [Delete Dubbing Project](#delete-dubbing-project) :new: -- [SFX Generation](#sfx-generation) :new: +- [Dubbing](#dubbing) + - [Dub](#dub) + - [Get Dubbing Metadata](#get-dubbing-metadata) + - [Get 
Transcript for Dub](#get-transcript-for-dub) + - [Get dubbed file](#get-dubbed-file) + - [Delete Dubbing Project](#delete-dubbing-project) +- [SFX Generation](#sfx-generation) - [History](#history) - [Get History](#get-history) - [Get History Item](#get-history-item) @@ -265,8 +266,8 @@ Convert text to speech. var api = new ElevenLabsClient(); var text = "The quick brown fox jumps over the lazy dog."; var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); -var defaultVoiceSettings = await api.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); -var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(text, voice, defaultVoiceSettings); +var request = new TextToSpeechRequest(voice, text); +var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request); audioSource.PlayOneShot(voiceClip.AudioClip); ``` @@ -284,18 +285,14 @@ Stream text to speech. var api = new ElevenLabsClient(); var text = "The quick brown fox jumps over the lazy dog."; var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); -var partialClips = new Queue(); -var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync( - text, - voice, - partialClip => - { - // Note: Best to queue them and play them in update loop! 
- // See TextToSpeech sample demo for details - partialClips.Enqueue(partialClip); - }); -// The full completed clip: -audioSource.clip = voiceClip.AudioClip; +var partialClips = new Queue(); +var request = new TextToSpeechRequest(voice, message, model: Model.EnglishTurboV2, outputFormat: OutputFormat.PCM_44100); +var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(request, partialClip => +{ + // Note: check demo scene for best practices + // on how to handle playback with OnAudioFilterRead + partialClips.Enqueue(partialClip); +}); ``` ### [Voices](https://docs.elevenlabs.io/api-reference/voices) diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs index 0797c00..61fc154 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs @@ -1106,7 +1106,7 @@ private async void GenerateSynthesizedText() Directory.CreateDirectory(downloadDir); } - voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(speechSynthesisTextInput, currentVoiceOption, currentVoiceSettings, currentModelOption); + voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(new(currentVoiceOption, speechSynthesisTextInput, voiceSettings: currentVoiceSettings, model: currentModelOption)); voiceClip.CopyIntoProject(editorDownloadDirectory); } catch (Exception e) @@ -1225,7 +1225,7 @@ private void RenderVoiceLab() EditorGUILayout.Space(EndWidth); EditorGUILayout.EndHorizontal(); EditorGUI.indentLevel++; - + EditorGUILayout.BeginHorizontal(); { EditorGUILayout.LabelField(voice.Id, EditorStyles.boldLabel); @@ -1242,7 +1242,7 @@ private void RenderVoiceLab() EditorGUILayout.Space(EndWidth); EditorGUILayout.EndHorizontal(); EditorGUI.indentLevel++; - + if (!voiceLabels.TryGetValue(voice.Id, out var cachedLabels)) { cachedLabels = new Dictionary(); diff --git 
a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/CacheFormat.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/CacheFormat.cs new file mode 100644 index 0000000..7e09111 --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/CacheFormat.cs @@ -0,0 +1,11 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +namespace ElevenLabs +{ + public enum CacheFormat + { + None, + Ogg, + Wav + } +} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/OutputFormatExtensions.cs.meta b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/CacheFormat.cs.meta similarity index 86% rename from ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/OutputFormatExtensions.cs.meta rename to ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/CacheFormat.cs.meta index f21b387..820d7d1 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/OutputFormatExtensions.cs.meta +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/CacheFormat.cs.meta @@ -1,5 +1,5 @@ fileFormatVersion: 2 -guid: 91fcef5ec7d363740b7a354d26952229 +guid: 477e464b1c1991e439b1a2a3c46ed761 MonoImporter: externalObjects: {} serializedVersion: 2 diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs index 1965774..8676262 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs @@ -2,8 +2,12 @@ using ElevenLabs.Extensions; using System; +using System.Threading; +using System.Threading.Tasks; using UnityEngine; using UnityEngine.Scripting; +using Utilities.Audio; +using Utilities.WebRequestRest; namespace ElevenLabs { @@ -12,16 +16,30 @@ namespace ElevenLabs public class GeneratedClip : ISerializationCallbackReceiver { [Preserve] - internal GeneratedClip(string id, string text, AudioClip 
audioClip, string cachedPath) + internal GeneratedClip(string id, string text, AudioClip audioClip, string cachedPath = null) { this.id = id; this.text = text; TextHash = $"{id}{text}".GenerateGuid(); textHash = TextHash.ToString(); - this.audioClip = audioClip; this.cachedPath = cachedPath; + SampleRate = audioClip.frequency; } + [Preserve] + internal GeneratedClip(string id, string text, ReadOnlyMemory clipData, int sampleRate, string cachedPath = null) + { + this.id = id; + this.text = text; + TextHash = $"{id}{text}".GenerateGuid(); + textHash = TextHash.ToString(); + this.cachedPath = cachedPath; + ClipData = clipData; + SampleRate = sampleRate; + } + + private readonly ReadOnlyMemory audioData; + [SerializeField] private string id; @@ -44,7 +62,25 @@ internal GeneratedClip(string id, string text, AudioClip audioClip, string cache private AudioClip audioClip; [Preserve] - public AudioClip AudioClip => audioClip; + public AudioClip AudioClip + { + get + { + if (audioClip == null && !audioData.IsEmpty) + { + var pcmData = PCMEncoder.Decode(audioData.ToArray()); + audioClip = AudioClip.Create(Id, pcmData.Length, 1, SampleRate, false); + audioClip.SetData(pcmData, 0); + } + + if (audioClip == null) + { + Debug.LogError($"{nameof(audioClip)} is null, try loading it with LoadCachedAudioClipAsync"); + } + + return audioClip; + } + } [SerializeField] private string cachedPath; @@ -52,8 +88,47 @@ internal GeneratedClip(string id, string text, AudioClip audioClip, string cache [Preserve] public string CachedPath => cachedPath; + public ReadOnlyMemory ClipData { get; } + + private float[] clipSamples; + + public float[] ClipSamples + { + get + { + if (!ClipData.IsEmpty) + { + clipSamples ??= PCMEncoder.Decode(ClipData.ToArray()); + } + else if (audioClip != null) + { + clipSamples = new float[audioClip.samples]; + audioClip.GetData(clipSamples, 0); + } + + return clipSamples; + } + } + + public int SampleRate { get; } + public void OnBeforeSerialize() => textHash = 
TextHash.ToString(); public void OnAfterDeserialize() => TextHash = Guid.Parse(textHash); + + public static implicit operator AudioClip(GeneratedClip clip) => clip?.AudioClip; + + public async Task LoadCachedAudioClipAsync(CancellationToken cancellationToken = default) + { + var audioType = cachedPath switch + { + var path when path.EndsWith(".ogg") => AudioType.OGGVORBIS, + var path when path.EndsWith(".wav") => AudioType.WAV, + var path when path.EndsWith(".mp3") => AudioType.MPEG, + _ => AudioType.UNKNOWN + }; + + return await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, cancellationToken: cancellationToken); + } } } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/OutputFormatExtensions.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/OutputFormatExtensions.cs deleted file mode 100644 index e8b4729..0000000 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/OutputFormatExtensions.cs +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed under the MIT License. See LICENSE in the project root for license information. 
- -using System; -using UnityEngine; - -namespace ElevenLabs -{ - public static class OutputFormatExtensions - { - public static AudioType GetAudioType(this OutputFormat format) - => format switch - { - OutputFormat.MP3_44100_64 => AudioType.MPEG, - OutputFormat.MP3_44100_96 => AudioType.MPEG, - OutputFormat.MP3_44100_128 => AudioType.MPEG, - OutputFormat.MP3_44100_192 => AudioType.MPEG, - OutputFormat.PCM_16000 => AudioType.OGGVORBIS, - OutputFormat.PCM_22050 => AudioType.OGGVORBIS, - OutputFormat.PCM_24000 => AudioType.OGGVORBIS, - OutputFormat.PCM_44100 => AudioType.OGGVORBIS, - _ => throw new ArgumentOutOfRangeException(nameof(format), format, null) - }; - } -} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs new file mode 100644 index 0000000..543208a --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs @@ -0,0 +1,44 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. 
+ +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace ElevenLabs +{ + /// + /// Represents timing information for a single character in the transcript + /// + [Preserve] + public class TimestampedTranscriptCharacter + { + [Preserve] + [JsonConstructor] + internal TimestampedTranscriptCharacter(string character, double startTime, double endTime) + { + Character = character; + StartTime = startTime; + EndTime = endTime; + } + + /// + /// The character being spoken + /// + [Preserve] + [JsonProperty("character")] + public string Character { get; } + + /// + /// The time in seconds when this character starts being spoken + /// + [Preserve] + [JsonProperty("character_start_times_seconds")] + public double StartTime { get; } + + /// + /// The time in seconds when this character finishes being spoken + /// + [Preserve] + [JsonProperty("character_end_times_seconds")] + public double EndTime { get; } + } +} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs.meta b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs.meta new file mode 100644 index 0000000..dc6646f --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/TimestampedTranscriptCharacter.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 569bb5fe3607d4d18b6bcb3e200f1107 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 5b71cbcaf078a8e44a5c96dcd24376d5, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/VoiceClip.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/VoiceClip.cs index 24803d8..71d3bea 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/VoiceClip.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/VoiceClip.cs @@ -12,16 +12,26 @@ namespace ElevenLabs public sealed class VoiceClip 
: GeneratedClip { [Preserve] - internal VoiceClip(string id, string text, Voice voice, AudioClip audioClip, string cachedPath) + internal VoiceClip(string id, string text, Voice voice, AudioClip audioClip, string cachedPath = null) : base(id, text, audioClip, cachedPath) { this.voice = voice; } + [Preserve] + internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory clipData, int sampleRate, string cachedPath = null) + : base(id, text, clipData, sampleRate, cachedPath) + { + this.voice = voice; + } + [SerializeField] private Voice voice; [Preserve] public Voice Voice => voice; + + [Preserve] + public TimestampedTranscriptCharacter[] TimestampedTranscriptCharacters { get; internal set; } } } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs index 0228dc2..c22d3cc 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs @@ -7,11 +7,9 @@ using System.Diagnostics; using System.Globalization; using System.IO; -using System.Linq; using System.Threading; using System.Threading.Tasks; using UnityEngine; -using UnityEngine.Networking; using Utilities.WebRequestRest; using Debug = UnityEngine.Debug; diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs new file mode 100644 index 0000000..c6bf1a1 --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs @@ -0,0 +1,16 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. 
+ +namespace ElevenLabs.Extensions +{ + public static class Extensions + { + public static int GetSampleRate(this OutputFormat format) => format switch + { + OutputFormat.PCM_16000 => 16000, + OutputFormat.PCM_22050 => 22050, + OutputFormat.PCM_24000 => 24000, + OutputFormat.PCM_44100 => 44100, + _ => 44100 + }; + } +} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs.meta b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs.meta new file mode 100644 index 0000000..4f7b194 --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Extensions/Extensions.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 9679edc3d5fee9e4a892f93a65389087 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 5b71cbcaf078a8e44a5c96dcd24376d5, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/History/HistoryEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/History/HistoryEndpoint.cs index 7841626..50d6ba0 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/History/HistoryEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/History/HistoryEndpoint.cs @@ -11,8 +11,7 @@ using System.Threading.Tasks; using UnityEngine; using Utilities.Async; -using Utilities.Audio; -using Utilities.Encoding.OggVorbis; +using Utilities.Encoding.Wav; using Utilities.WebRequestRest; namespace ElevenLabs.History @@ -82,11 +81,11 @@ public async Task DownloadHistoryAudioAsync(HistoryItem historyItem, .CreateNewDirectory(nameof(History)) .CreateNewDirectory(historyItem.VoiceId); var voice = await client.VoicesEndpoint.GetVoiceAsync(historyItem.VoiceId, cancellationToken: cancellationToken); - var audioType = historyItem.ContentType.Contains("mpeg") ? AudioType.MPEG : AudioType.OGGVORBIS; + var audioType = historyItem.ContentType.Contains("mpeg") ? 
AudioType.MPEG : AudioType.WAV; var extension = audioType switch { AudioType.MPEG => "mp3", - AudioType.OGGVORBIS => "ogg", + AudioType.WAV => "wav", _ => throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}") }; var cachedPath = Path.Combine(voiceDirectory, $"{historyItem.Id}.{extension}"); @@ -101,11 +100,9 @@ public async Task DownloadHistoryAudioAsync(HistoryItem historyItem, case AudioType.MPEG: await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(false); break; - case AudioType.OGGVORBIS: - var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit); - var sampleRate = 44100; // TODO unknown sample rate. - var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, sampleRate, 1, cancellationToken: cancellationToken).ConfigureAwait(false); - await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken).ConfigureAwait(false); + case AudioType.WAV: + var sampleRate = 44100; // TODO unknown sample rate. how do we figure it out? 
+ await WavEncoder.WriteToFileAsync(cachedPath, response.Data, 1, sampleRate, cancellationToken: cancellationToken); break; default: throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}"); @@ -113,7 +110,7 @@ public async Task DownloadHistoryAudioAsync(HistoryItem historyItem, } await Awaiters.UnityMainThread; - var audioClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.UNKNOWN, parameters: new RestParameters(debug: EnableDebug), cancellationToken: cancellationToken); + var audioClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, parameters: new RestParameters(debug: EnableDebug), cancellationToken: cancellationToken); return new VoiceClip(historyItem.Id, historyItem.Text, voice, audioClip, cachedPath); } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs new file mode 100644 index 0000000..9971f8c --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs @@ -0,0 +1,52 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. 
+ +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace ElevenLabs.TextToSpeech +{ + [Preserve] + internal sealed class Alignment + { + [Preserve] + [JsonConstructor] + internal Alignment( + [JsonProperty("characters")] string[] characters, + [JsonProperty("character_start_times_seconds")] double[] startTimes, + [JsonProperty("character_end_times_seconds")] double[] endTimes) + { + Characters = characters; + StartTimes = startTimes; + EndTimes = endTimes; + } + + [Preserve] + [JsonProperty("characters")] + public string[] Characters { get; } + + [Preserve] + [JsonProperty("character_start_times_seconds")] + public double[] StartTimes { get; } + + [Preserve] + [JsonProperty("character_end_times_seconds")] + public double[] EndTimes { get; } + + [Preserve] + public static implicit operator TimestampedTranscriptCharacter[](Alignment alignment) + { + if (alignment == null) { return null; } + var characters = alignment.Characters; + var startTimes = alignment.StartTimes; + var endTimes = alignment.EndTimes; + var timestampedTranscriptCharacters = new TimestampedTranscriptCharacter[characters.Length]; + + for (var i = 0; i < characters.Length; i++) + { + timestampedTranscriptCharacters[i] = new TimestampedTranscriptCharacter(characters[i], startTimes[i], endTimes[i]); + } + + return timestampedTranscriptCharacters; + } + } +} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs.meta b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs.meta new file mode 100644 index 0000000..f45af4c --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/Alignment.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: fb7cd218df63a7746b866b1182146ab4 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 5b71cbcaf078a8e44a5c96dcd24376d5, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git 
a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs index 5032a63..ddb8669 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs @@ -1,268 +1,331 @@ -// Licensed under the MIT License. See LICENSE in the project root for license information. - -using ElevenLabs.Extensions; -using ElevenLabs.Models; -using ElevenLabs.Voices; -using Newtonsoft.Json; -using System; -using System.Collections.Generic; -using System.IO; -using System.Text; -using System.Threading; -using System.Threading.Tasks; -using UnityEngine; -using Utilities.Async; -using Utilities.Audio; -using Utilities.Encoding.OggVorbis; -using Utilities.WebRequestRest; - -namespace ElevenLabs.TextToSpeech -{ - /// - /// Access to convert text to synthesized speech. - /// - public sealed class TextToSpeechEndpoint : ElevenLabsBaseEndPoint - { - private const string HistoryItemId = "history-item-id"; - private const string OutputFormatParameter = "output_format"; - private const string OptimizeStreamingLatencyParameter = "optimize_streaming_latency"; - - public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { } - - protected override string Root => "text-to-speech"; - - /// - /// Converts text into speech using a voice of your choice and returns audio. - /// - /// - /// Text input to synthesize speech for. Maximum 5000 characters. - /// - /// - /// to use. - /// - /// - /// Optional, that will override the default settings in . - /// - /// - /// Optional, to use. Defaults to . - /// - /// - /// Output format of the generated audio.
- /// Defaults to - /// - /// - /// Optional, You can turn on latency optimizations at some cost of quality. - /// The best possible final latency varies by model.
- /// Possible values:
- /// 0 - default mode (no latency optimizations)
- /// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)
- /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)
- /// 3 - max latency optimizations
- /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings - /// (best latency, but can mispronounce eg numbers and dates). - /// - /// Optional, . - /// . - public async Task TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default) - => await TextToSpeechAsync(new TextToSpeechRequest(voice, text, Encoding.UTF8, voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken), outputFormat, optimizeStreamingLatency, model), cancellationToken); - - /// - /// Converts text into speech using a voice of your choice and returns audio. - /// - /// . - /// Optional, . - /// . - public async Task TextToSpeechAsync(TextToSpeechRequest request, CancellationToken cancellationToken = default) - { - var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions); - var parameters = new Dictionary - { - { OutputFormatParameter, request.OutputFormat.ToString().ToLower() } - }; - - if (request.OptimizeStreamingLatency.HasValue) - { - parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString()); - } - - var response = await Rest.PostAsync(GetUrl($"/{request.Voice.Id}", parameters), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken); - response.Validate(EnableDebug); - - if (!response.Headers.TryGetValue(HistoryItemId, out var clipId)) - { - throw new ArgumentException("Failed to parse clip id!"); - } - - var audioType = request.OutputFormat.GetAudioType(); - var extension = audioType switch - { - AudioType.MPEG => "mp3", - AudioType.OGGVORBIS => "ogg", - _ => throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}") - }; - var downloadDirectory = await 
GetCacheDirectoryAsync(request.Voice); - var cachedPath = $"{downloadDirectory}/{clipId}.{extension}"; - - if (!File.Exists(cachedPath)) - { - switch (audioType) - { - case AudioType.MPEG: - await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(false); - break; - case AudioType.OGGVORBIS: - var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit); - var frequency = request.OutputFormat switch - { - OutputFormat.PCM_16000 => 16000, - OutputFormat.PCM_22050 => 22050, - OutputFormat.PCM_24000 => 24000, - OutputFormat.PCM_44100 => 44100, - _ => throw new ArgumentOutOfRangeException(nameof(request.OutputFormat), request.OutputFormat, null) - }; - var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken).ConfigureAwait(false); - await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken).ConfigureAwait(false); - break; - default: - throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}"); - } - } - - await Awaiters.UnityMainThread; - var audioClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, parameters: new RestParameters(debug: EnableDebug), cancellationToken: cancellationToken); - return new VoiceClip(clipId, request.Text, request.Voice, audioClip, cachedPath); - } - - /// - /// Converts text into speech using a voice of your choice and returns audio as an audio stream. - /// - /// - /// Text input to synthesize speech for. Maximum 5000 characters. - /// - /// - /// to use. - /// - /// - /// Optional, Callback to enable streaming audio as it comes in.
- /// Returns partial . - /// - /// - /// Optional, that will override the default settings in . - /// - /// - /// Optional, to use. Defaults to . - /// - /// - /// Output format of the generated audio.
- /// Note: Must be PCM format to stream audio in Unity!
- /// Defaults to . - /// - /// - /// Optional, You can turn on latency optimizations at some cost of quality. - /// The best possible final latency varies by model.
- /// Possible values:
- /// 0 - default mode (no latency optimizations)
- /// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)
- /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)
- /// 3 - max latency optimizations
- /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings - /// (best latency, but can mispronounce eg numbers and dates). - /// - /// - /// Optional, . - /// - /// Downloaded clip path, and the loaded audio clip. - public async Task StreamTextToSpeechAsync(string text, Voice voice, Action partialClipCallback, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.PCM_24000, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default) - => await StreamTextToSpeechAsync(new TextToSpeechRequest(voice, text, Encoding.UTF8, voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken), outputFormat, optimizeStreamingLatency, model), partialClipCallback, cancellationToken); - - /// - /// Converts text into speech using a voice of your choice and returns audio as an audio stream. - /// - /// . - /// - /// Optional, Callback to enable streaming audio as it comes in.
- /// Returns partial . - /// - /// - /// Optional, . - /// - /// Downloaded clip path, and the loaded audio clip. - public async Task StreamTextToSpeechAsync(TextToSpeechRequest request, Action partialClipCallback, CancellationToken cancellationToken = default) - { - var frequency = request.OutputFormat switch - { - OutputFormat.MP3_44100_64 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"), - OutputFormat.MP3_44100_96 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"), - OutputFormat.MP3_44100_128 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"), - OutputFormat.MP3_44100_192 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"), - OutputFormat.PCM_16000 => 16000, - OutputFormat.PCM_22050 => 22050, - OutputFormat.PCM_24000 => 24000, - OutputFormat.PCM_44100 => 44100, - _ => throw new ArgumentOutOfRangeException(nameof(request.OutputFormat), request.OutputFormat, null) - }; - var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions); - var parameters = new Dictionary - { - { OutputFormatParameter, request.OutputFormat.ToString().ToLower() } - }; - - if (request.OptimizeStreamingLatency.HasValue) - { - parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString()); - } - - var part = 0; - var response = await Rest.PostAsync(GetUrl($"/{request.Voice.Id}/stream", parameters), payload, StreamCallback, eventChunkSize: 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken).ConfigureAwait(true); - response.Validate(EnableDebug); - - if (!response.Headers.TryGetValue(HistoryItemId, out var clipId)) - { - throw new ArgumentException("Failed to parse clip id!"); - } - - var pcmData = PCMEncoder.Decode(response.Data); - var downloadDirectory = await 
GetCacheDirectoryAsync(request.Voice); - var cachedPath = $"{downloadDirectory}/{clipId}.ogg"; - var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken); - await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken); - var fullClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.OGGVORBIS, parameters: new RestParameters(debug: EnableDebug), compressed: false, streamingAudio: true, cancellationToken: cancellationToken); - return new VoiceClip(clipId, request.Text, request.Voice, fullClip, cachedPath); - - void StreamCallback(Response partialResponse) - { - try - { - if (!partialResponse.Headers.TryGetValue(HistoryItemId, out clipId)) - { - throw new ArgumentException("Failed to parse clip id!"); - } - - var chunk = PCMEncoder.Decode(partialResponse.Data); - var audioClip = AudioClip.Create($"{clipId}_{++part}", chunk.Length, 1, frequency, false); - - if (!audioClip.SetData(chunk, 0)) - { - Debug.LogError("Failed to set pcm data to partial clip."); - return; - } - - partialClipCallback.Invoke(audioClip); - } - catch (Exception e) - { - Debug.LogError(e); - } - } - } - - private static async Task GetCacheDirectoryAsync(Voice voice) - { - await Rest.ValidateCacheDirectoryAsync(); - return Rest.DownloadCacheDirectory - .CreateNewDirectory(nameof(ElevenLabs)) - .CreateNewDirectory(nameof(TextToSpeech)) - .CreateNewDirectory(voice.Id); - } - } -} +// Licensed under the MIT License. See LICENSE in the project root for license information. 
+ +using ElevenLabs.Extensions; +using ElevenLabs.Models; +using ElevenLabs.Voices; +using Newtonsoft.Json; +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using UnityEngine; +using Utilities.Audio; +using Utilities.Encoding.OggVorbis; +using Utilities.Encoding.Wav; +using Utilities.WebRequestRest; + +namespace ElevenLabs.TextToSpeech +{ + /// + /// Access to convert text to synthesized speech. + /// + public sealed class TextToSpeechEndpoint : ElevenLabsBaseEndPoint + { + private const string HistoryItemId = "history-item-id"; + private const string OutputFormatParameter = "output_format"; + private const string OptimizeStreamingLatencyParameter = "optimize_streaming_latency"; + + public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { } + + protected override string Root => "text-to-speech"; + + [Obsolete("use overload with TextToSpeechRequest")] + public async Task TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default) + => await TextToSpeechAsync( + new TextToSpeechRequest( + voice, + text, + Encoding.UTF8, + voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken), + outputFormat, + optimizeStreamingLatency, + model), + cancellationToken); + + [Obsolete("use TextToSpeechAsync with VoiceClip partialClipCallback")] + public async Task StreamTextToSpeechAsync(string text, Voice voice, Action partialClipCallback, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.PCM_24000, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default) + => await StreamTextToSpeechAsync( + new TextToSpeechRequest( + voice, + text, + Encoding.UTF8, + voiceSettings ?? 
voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken), + outputFormat, + optimizeStreamingLatency, + model), + partialClipCallback, cancellationToken); + + [Obsolete("use TextToSpeechAsync with VoiceClip partialClipCallback")] + public async Task StreamTextToSpeechAsync(TextToSpeechRequest request, Action partialClipCallback, CancellationToken cancellationToken = default) + => await TextToSpeechAsync(request, voiceClip => + { + partialClipCallback.Invoke(voiceClip.AudioClip); + }, cancellationToken); + + /// + /// Converts text to synthesized speech. + /// + /// . + /// Optional, . + /// . + public async Task TextToSpeechAsync(TextToSpeechRequest request, CancellationToken cancellationToken = default) + { + request.VoiceSettings ??= await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken); + var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions); + var parameters = CreateRequestParameters(request); + var endpoint = $"/{request.Voice}"; + + if (request.WithTimestamps) + { + endpoint += "/with-timestamps"; + } + + var response = await Rest.PostAsync(GetUrl(endpoint, parameters), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken); + response.Validate(EnableDebug); + + if (!response.Headers.TryGetValue(HistoryItemId, out var clipId)) + { + throw new ArgumentException("Failed to parse clip id!"); + } + + byte[] audioData; + TimestampedTranscriptCharacter[] transcriptionCharacters = null; + + if (request.WithTimestamps) + { + var transcriptResponse = JsonConvert.DeserializeObject(response.Body, ElevenLabsClient.JsonSerializationOptions); + audioData = transcriptResponse.AudioBytes; + transcriptionCharacters = transcriptResponse.Alignment; + } + else + { + audioData = response.Data; + } + + string cachedPath = null; + + if (request.CacheFormat != CacheFormat.None) + { + cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, 
request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true); + } + + return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(audioData), request.OutputFormat.GetSampleRate(), cachedPath) + { + TimestampedTranscriptCharacters = transcriptionCharacters + }; + } + + /// + /// Converts text to synthesized speech. + /// + /// . + /// Partial callback with streaming data. + /// Optional, . + /// . + public async Task TextToSpeechAsync(TextToSpeechRequest request, Action partialClipCallback, CancellationToken cancellationToken = default) + { + if (request.OutputFormat is not OutputFormat.PCM_16000 and not OutputFormat.PCM_22050 and not OutputFormat.PCM_24000 and not OutputFormat.PCM_44100) + { + Debug.LogWarning($"{nameof(request.OutputFormat)} must be a PCM format! defaulting to 24000"); + request.OutputFormat = OutputFormat.PCM_24000; + } + + request.VoiceSettings ??= await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken); + + var frequency = request.OutputFormat.GetSampleRate(); + var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions); + var parameters = CreateRequestParameters(request); + var endpoint = $"/{request.Voice.Id}/stream"; + + var part = 0; + StringBuilder textBuffer; + List accumulatedPCMData = null; + List accumulatedTranscriptData = null; + Action streamCallback; + + if (request.WithTimestamps) + { + endpoint += "/with-timestamps"; + textBuffer = new StringBuilder(); + accumulatedPCMData = new List(); + accumulatedTranscriptData = new List(); + streamCallback = TranscriptionStreamCallback; + } + else + { + streamCallback = StreamCallback; + } + + var response = await Rest.PostAsync(GetUrl(endpoint, parameters), payload, streamCallback, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken); + response.Validate(EnableDebug); + + if (!response.Headers.TryGetValue(HistoryItemId, out var clipId)) + { + throw new 
ArgumentException("Failed to parse clip id!"); + } + + var audioData = request.WithTimestamps ? accumulatedPCMData!.ToArray() : response.Data; + string cachedPath = null; + + if (request.CacheFormat != CacheFormat.None) + { + cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true); + } + + return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(audioData), request.OutputFormat.GetSampleRate(), cachedPath) + { + TimestampedTranscriptCharacters = accumulatedTranscriptData?.ToArray() ?? Array.Empty() + }; + + void StreamCallback(Response partialResponse) + { + try + { + if (!partialResponse.Headers.TryGetValue(HistoryItemId, out clipId)) + { + throw new ArgumentException("Failed to parse clip id!"); + } + + partialClipCallback.Invoke(new VoiceClip($"{clipId}_{++part}", request.Text, request.Voice, new ReadOnlyMemory(partialResponse.Data), frequency)); + } + catch (Exception e) + { + Debug.LogError(e); + } + } + + void TranscriptionStreamCallback(Response partialResponse) + { + try + { + if (!partialResponse.Headers.TryGetValue(HistoryItemId, out clipId)) + { + throw new ArgumentException("Failed to parse clip id!"); + } + + var chunkText = Encoding.UTF8.GetString(partialResponse.Data); + textBuffer.Append(chunkText); + + // Process any complete lines + var text = textBuffer.ToString(); + var lines = text.Split('\n'); + + // Keep the last potentially incomplete line + textBuffer.Clear(); + textBuffer.Append(lines[^1]); + + // Process all complete lines + for (var i = 0; i < lines.Length - 1; i++) + { + ProcessTranscribedVoiceClip(lines[i].Trim()); + } + } + catch (Exception e) + { + Debug.LogError(e); + } + } + + void ProcessTranscribedVoiceClip(string line) + { + if (string.IsNullOrEmpty(line)) { return; } + + try + { + var partialTranscription = JsonConvert.DeserializeObject(line, ElevenLabsClient.JsonSerializationOptions); + var 
timestampedTranscriptCharacters = (TimestampedTranscriptCharacter[])partialTranscription.Alignment ?? Array.Empty(); + + try + { + partialClipCallback.Invoke(new VoiceClip($"{clipId}_{++part}", request.Text, request.Voice, new ReadOnlyMemory(partialTranscription.AudioBytes), frequency) + { + TimestampedTranscriptCharacters = timestampedTranscriptCharacters + }); + } + finally + { + accumulatedPCMData.AddRange(partialTranscription.AudioBytes); + accumulatedTranscriptData.AddRange(timestampedTranscriptCharacters); + } + } + catch (JsonReaderException e) + { + Debug.LogWarning($"Failed to parse line as JSON: {e.Message}"); + } + } + } + + private static Dictionary CreateRequestParameters(TextToSpeechRequest request) + { + var parameters = new Dictionary + { + { OutputFormatParameter, request.OutputFormat.ToString().ToLower() } + }; + + if (request.OptimizeStreamingLatency.HasValue) + { + parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString()); + } + + return parameters; + } + + private static async Task SaveAudioToCache(byte[] audioData, string clipId, Voice voice, OutputFormat outputFormat, CacheFormat cacheFormat, CancellationToken cancellationToken) + { + string extension; + AudioType audioType; + + if (outputFormat is OutputFormat.MP3_44100_64 or OutputFormat.MP3_44100_96 or OutputFormat.MP3_44100_128 or OutputFormat.MP3_44100_192) + { + extension = "mp3"; + audioType = AudioType.MPEG; + } + else + { + switch (cacheFormat) + { + case CacheFormat.Wav: + extension = "wav"; + audioType = AudioType.WAV; + break; + case CacheFormat.Ogg: + extension = "ogg"; + audioType = AudioType.OGGVORBIS; + break; + case CacheFormat.None: + default: + throw new ArgumentOutOfRangeException(nameof(cacheFormat), cacheFormat, null); + } + } + + await Rest.ValidateCacheDirectoryAsync(); + var downloadDirectory = Rest.DownloadCacheDirectory + .CreateNewDirectory(nameof(ElevenLabs)) + .CreateNewDirectory(nameof(TextToSpeech)) + 
.CreateNewDirectory(voice.Id); + var cachedPath = $"{downloadDirectory}/{clipId}.{extension}"; + + if (!File.Exists(cachedPath)) + { + switch (audioType) + { + case AudioType.MPEG: + await File.WriteAllBytesAsync(cachedPath, audioData, cancellationToken).ConfigureAwait(false); + break; + case AudioType.OGGVORBIS: + var oggBytes = await OggEncoder.ConvertToBytesAsync(PCMEncoder.Decode(audioData), sampleRate: outputFormat.GetSampleRate(), channels: 1, cancellationToken: cancellationToken).ConfigureAwait(false); + await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken).ConfigureAwait(false); + break; + case AudioType.WAV: + await WavEncoder.WriteToFileAsync(cachedPath, audioData, sampleRate: outputFormat.GetSampleRate(), channels: 1, cancellationToken: cancellationToken).ConfigureAwait(false); + break; + } + } + + return cachedPath; + } + + } +} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechRequest.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechRequest.cs index 0beb36f..5ab224a 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechRequest.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechRequest.cs @@ -12,6 +12,47 @@ namespace ElevenLabs.TextToSpeech [Preserve] public sealed class TextToSpeechRequest { + /// + /// Constructor. + /// + /// + /// to use. + /// + /// + /// Text input to synthesize speech for. Maximum 5000 characters. + /// + /// to use for . + /// + /// Optional, that will override the default settings in . + /// + /// + /// Optional, to use. Defaults to . + /// + /// + /// Output format of the generated audio.
+ /// Defaults to + /// + /// + /// Optional, You can turn on latency optimizations at some cost of quality. + /// The best possible final latency varies by model.
+ /// Possible values:
+ /// 0 - default mode (no latency optimizations)
+ /// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)
+ /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)
+ /// 3 - max latency optimizations
+ /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings + /// (best latency, but can mispronounce e.g. numbers and dates). + /// + /// + /// + /// + /// + /// + /// Optional, Language code (ISO 639-1) used to enforce a language for the model. Currently only supports language enforcement. + /// For other models, an error will be returned if language code is provided. + /// + /// + /// [Preserve] public TextToSpeechRequest( Voice voice, @@ -21,7 +62,13 @@ public TextToSpeechRequest( OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Model model = null, - string previousText = null) + string previousText = null, + string nextText = null, + string[] previousRequestIds = null, + string[] nextRequestIds = null, + string languageCode = null, + CacheFormat cacheFormat = CacheFormat.Wav, + bool withTimestamps = false) { if (string.IsNullOrWhiteSpace(text)) { @@ -45,12 +92,26 @@ public TextToSpeechRequest( } Text = text; - Model = model ?? Models.Model.MultiLingualV2; - Voice = voice; - VoiceSettings = voiceSettings ?? voice.Settings ?? throw new ArgumentNullException(nameof(voiceSettings)); - PreviousText = previousText; + Model = model ?? Models.Model.TurboV2_5; + Voice = string.IsNullOrWhiteSpace(voice) ? Voice.Adam : voice; + VoiceSettings = voiceSettings ?? 
voice.Settings; OutputFormat = outputFormat; OptimizeStreamingLatency = optimizeStreamingLatency; + PreviousText = previousText; + NextText = nextText; + if (previousRequestIds?.Length > 3) + { + previousRequestIds = previousRequestIds[..3]; + } + PreviousRequestIds = previousRequestIds; + if (nextRequestIds?.Length > 3) + { + nextRequestIds = nextRequestIds[..3]; + } + NextRequestIds = nextRequestIds; + LanguageCode = languageCode; + CacheFormat = cacheFormat; + WithTimestamps = withTimestamps; } [Preserve] @@ -70,15 +131,45 @@ public TextToSpeechRequest( public VoiceSettings VoiceSettings { get; internal set; } [Preserve] - [JsonProperty("previous_text")] - public string PreviousText { get; } + [JsonIgnore] + public OutputFormat OutputFormat { get; internal set; } [Preserve] [JsonIgnore] - public OutputFormat OutputFormat { get; } + public CacheFormat CacheFormat { get; internal set; } [Preserve] [JsonIgnore] public int? OptimizeStreamingLatency { get; } + + [Preserve] + [JsonProperty("previous_text")] + public string PreviousText { get; } + + [Preserve] + [JsonProperty("next_text")] + public string NextText { get; } + + /// + /// A maximum of three next or previous history item ids can be sent + /// + [Preserve] + [JsonProperty("previous_request_ids")] + public string[] PreviousRequestIds { get; } + + /// + /// A maximum of three next or previous history item ids can be sent + /// + [Preserve] + [JsonProperty("next_request_ids")] + public string[] NextRequestIds { get; } + + [Preserve] + [JsonProperty("language_code")] + public string LanguageCode { get; } + + [Preserve] + [JsonIgnore] + public bool WithTimestamps { get; } } } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs new file mode 100644 index 0000000..52176a4 --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs @@ 
-0,0 +1,30 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace ElevenLabs.TextToSpeech +{ + [Preserve] + internal sealed class TranscriptionResponse + { + [Preserve] + [JsonConstructor] + public TranscriptionResponse( + [JsonProperty("audio_base64")] string audioBase64, + [JsonProperty("alignment")] Alignment alignment) + { + AudioBase64 = audioBase64; + Alignment = alignment; + } + + [JsonProperty("audio_base64")] + public string AudioBase64 { get; } + + [JsonIgnore] + public byte[] AudioBytes => System.Convert.FromBase64String(AudioBase64); + + [JsonProperty("alignment")] + public Alignment Alignment { get; } + } +} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs.meta b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs.meta new file mode 100644 index 0000000..4653e23 --- /dev/null +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TranscriptionResponse.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 317e44226b6324a47bc7b2dde458e978 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 5b71cbcaf078a8e44a5c96dcd24376d5, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/Voice.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/Voice.cs index 1bcc405..206e73e 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/Voice.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/Voice.cs @@ -94,7 +94,7 @@ public string Id public VoiceSettings Settings { get; internal set; } [Preserve] - public static implicit operator string(Voice voice) => voice.ToString(); + public static implicit operator string(Voice voice) => voice?.ToString(); [Preserve] public override string 
ToString() => Id; diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/VoicesEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/VoicesEndpoint.cs index ad67c0a..0f8b142 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/VoicesEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/VoicesEndpoint.cs @@ -11,8 +11,7 @@ using UnityEngine; using UnityEngine.Scripting; using Utilities.Async; -using Utilities.Audio; -using Utilities.Encoding.OggVorbis; +using Utilities.Encoding.Wav; using Utilities.WebRequestRest; namespace ElevenLabs.Voices @@ -329,11 +328,11 @@ public async Task DownloadVoiceSampleAudioAsync(Voice voice, Sample s .CreateNewDirectory(voice.Id) .CreateNewDirectory("Samples"); // TODO possibly handle other types? - var audioType = sample.MimeType.Contains("mpeg") ? AudioType.MPEG : AudioType.OGGVORBIS; + var audioType = sample.MimeType.Contains("mpeg") ? AudioType.MPEG : AudioType.WAV; var extension = audioType switch { AudioType.MPEG => "mp3", - AudioType.OGGVORBIS => "ogg", + AudioType.WAV => "wav", _ => throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}") }; var cachedPath = Path.Combine(downloadDirectory, $"{sample.Id}.{extension}"); @@ -348,11 +347,9 @@ public async Task DownloadVoiceSampleAudioAsync(Voice voice, Sample s case AudioType.MPEG: await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(false); break; - case AudioType.OGGVORBIS: - var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit); + case AudioType.WAV: var sampleRate = 44100; // TODO unknown sample rate. 
- var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, sampleRate, 1, cancellationToken: cancellationToken).ConfigureAwait(false); - await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken).ConfigureAwait(false); + await WavEncoder.WriteToFileAsync(cachedPath, response.Data, 1, sampleRate, cancellationToken: cancellationToken).ConfigureAwait(false); break; default: throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}"); diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/ElevenLabs.Demo.asmdef b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/ElevenLabs.Demo.asmdef index 9e57626..37adb79 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/ElevenLabs.Demo.asmdef +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/ElevenLabs.Demo.asmdef @@ -4,7 +4,8 @@ "references": [ "GUID:85797d0f100db6c43bf118660781167b", "GUID:a6609af893242c7438d701ddd4cce46a", - "GUID:7958db66189566541a6363568aee1575" + "GUID:7958db66189566541a6363568aee1575", + "GUID:d25c28436b1dcc9408d86f49a0f5210b" ], "includePlatforms": [], "excludePlatforms": [], diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs index 0e13968..c3fc85d 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs @@ -1,13 +1,16 @@ // Licensed under the MIT License. See LICENSE in the project root for license information. 
using ElevenLabs.Models; +using ElevenLabs.TextToSpeech; using ElevenLabs.Voices; using System; using System.Collections.Generic; using System.Linq; using System.Threading; +using System.Threading.Tasks; using UnityEngine; using Utilities.Async; +using Utilities.Audio; namespace ElevenLabs.Demo { @@ -30,10 +33,11 @@ public class TextToSpeechDemo : MonoBehaviour [SerializeField] private AudioSource audioSource; - private readonly Queue streamClipQueue = new(); + private readonly Queue sampleQueue = new(); #if !UNITY_2022_3_OR_NEWER private readonly CancellationTokenSource lifetimeCts = new(); + // ReSharper disable once InconsistentNaming private CancellationToken destroyCancellationToken => lifetimeCts.Token; #endif @@ -61,16 +65,22 @@ private async void Start() voice = (await api.VoicesEndpoint.GetAllVoicesAsync(destroyCancellationToken)).FirstOrDefault(); } - streamClipQueue.Clear(); - var streamQueueCts = CancellationTokenSource.CreateLinkedTokenSource(destroyCancellationToken); - PlayStreamQueue(streamQueueCts.Token); - var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(message, voice, partialClip => + + sampleQueue.Clear(); + var request = new TextToSpeechRequest(voice, message, model: Model.EnglishTurboV2, outputFormat: OutputFormat.PCM_24000); + var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request, partialClip => { - streamClipQueue.Enqueue(partialClip); - }, model: Model.EnglishTurboV2, cancellationToken: destroyCancellationToken); + const int sampleRate = 44100; // default unity audio clip sample rate + // we have to resample the partial clip to the unity audio clip sample rate. 
Ideally use PCM_44100 + var resampled = PCMEncoder.Resample(partialClip.ClipSamples, partialClip.SampleRate, sampleRate); + foreach (var sample in resampled) + { + sampleQueue.Enqueue(sample); + } + }, cancellationToken: destroyCancellationToken); audioSource.clip = voiceClip.AudioClip; - await new WaitUntil(() => streamClipQueue.Count == 0 && !audioSource.isPlaying); - streamQueueCts.Cancel(); + await new WaitUntil(() => sampleQueue.Count == 0 || destroyCancellationToken.IsCancellationRequested); + destroyCancellationToken.ThrowIfCancellationRequested(); if (debug) { @@ -79,41 +89,38 @@ private async void Start() } catch (Exception e) { - Debug.LogError(e); + switch (e) + { + case TaskCanceledException: + case OperationCanceledException: + break; + default: + Debug.LogException(e); + break; + } } } -#if !UNITY_2022_3_OR_NEWER - private void OnDestroy() + private void OnAudioFilterRead(float[] data, int channels) { - lifetimeCts.Cancel(); - lifetimeCts.Dispose(); - } -#endif + if (sampleQueue.Count <= 0) { return; } - private async void PlayStreamQueue(CancellationToken cancellationToken) - { - try + for (var i = 0; i < data.Length; i += channels) { - await new WaitUntil(() => streamClipQueue.Count > 0); - var endOfFrame = new WaitForEndOfFrame(); + var sample = sampleQueue.Dequeue(); - do + for (var j = 0; j < channels; j++) { - if (!audioSource.isPlaying && - streamClipQueue.TryDequeue(out var clip)) - { - Debug.Log($"playing partial clip: {clip.name}"); - audioSource.PlayOneShot(clip); - } - - await endOfFrame; - } while (!cancellationToken.IsCancellationRequested); - } - catch (Exception e) - { - Debug.LogError(e); + data[i + j] = sample; + } } } + +#if !UNITY_2022_3_OR_NEWER + private void OnDestroy() + { + lifetimeCts.Cancel(); + } +#endif } } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.unity b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.unity index 8f63880..a46f34b 100644 --- 
a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.unity +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.unity @@ -38,7 +38,6 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.44657898, g: 0.4964133, b: 0.5748178, a: 1} m_UseRadianceAmbientProbe: 0 --- !u!157 &3 LightmapSettings: @@ -104,7 +103,7 @@ NavMeshSettings: serializedVersion: 2 m_ObjectHideFlags: 0 m_BuildSettings: - serializedVersion: 3 + serializedVersion: 2 agentTypeID: 0 agentRadius: 0.5 agentHeight: 2 @@ -117,7 +116,7 @@ NavMeshSettings: cellSize: 0.16666667 manualTileSize: 0 tileSize: 256 - buildHeightMesh: 0 + accuratePlacement: 0 maxJobWorkers: 0 preserveTilesOutsideBounds: 0 debug: @@ -209,13 +208,13 @@ Transform: m_PrefabInstance: {fileID: 0} m_PrefabAsset: {fileID: 0} m_GameObject: {fileID: 541418635} - serializedVersion: 2 m_LocalRotation: {x: 0.40821788, y: -0.23456968, z: 0.10938163, w: 0.8754261} m_LocalPosition: {x: 0, y: 3, z: 0} m_LocalScale: {x: 1, y: 1, z: 1} m_ConstrainProportionsScale: 1 m_Children: [] m_Father: {fileID: 0} + m_RootOrder: 1 m_LocalEulerAnglesHint: {x: 50, y: -30, z: 0} --- !u!1 &1232673082 GameObject: @@ -242,13 +241,13 @@ Transform: m_PrefabInstance: {fileID: 0} m_PrefabAsset: {fileID: 0} m_GameObject: {fileID: 1232673082} - serializedVersion: 2 m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} m_LocalPosition: {x: 0, y: 0, z: 0} m_LocalScale: {x: 1, y: 1, z: 1} m_ConstrainProportionsScale: 1 m_Children: [] m_Father: {fileID: 0} + m_RootOrder: 0 m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} --- !u!114 &1232673085 MonoBehaviour: @@ -262,6 +261,8 @@ MonoBehaviour: m_Script: {fileID: 11500000, guid: b1dff7556fcd4ac4d8514517487cdbb3, type: 3} m_Name: m_EditorClassIdentifier: + configuration: {fileID: 0} + debug: 1 voice: name: Bella id: EXAVITQu4vr4xnSDxMaL @@ -410,17 +411,9 @@ Camera: m_projectionMatrixMode: 1 m_GateFitMode: 2 
m_FOVAxisMode: 0 - m_Iso: 200 - m_ShutterSpeed: 0.005 - m_Aperture: 16 - m_FocusDistance: 10 - m_FocalLength: 50 - m_BladeCount: 5 - m_Curvature: {x: 2, y: 11} - m_BarrelClipping: 0.25 - m_Anamorphism: 0 m_SensorSize: {x: 36, y: 24} m_LensShift: {x: 0, y: 0} + m_FocalLength: 50 m_NormalizedViewPortRect: serializedVersion: 2 x: 0 @@ -454,18 +447,11 @@ Transform: m_PrefabInstance: {fileID: 0} m_PrefabAsset: {fileID: 0} m_GameObject: {fileID: 1655310072} - serializedVersion: 2 m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} m_LocalPosition: {x: 0, y: 1, z: -10} m_LocalScale: {x: 1, y: 1, z: 1} m_ConstrainProportionsScale: 1 m_Children: [] m_Father: {fileID: 0} + m_RootOrder: 2 m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} ---- !u!1660057539 &9223372036854775807 -SceneRoots: - m_ObjectHideFlags: 0 - m_Roots: - - {fileID: 1655310075} - - {fileID: 541418637} - - {fileID: 1232673084} diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs index a772e66..8355144 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs @@ -1,5 +1,6 @@ -// Licensed under the MIT License. See LICENSE in the project root for license information. +// Licensed under the MIT License. See LICENSE in the project root for license information. 
+using ElevenLabs.TextToSpeech; using NUnit.Framework; using System.Collections.Generic; using System.Linq; @@ -14,10 +15,10 @@ internal class Test_Fixture_04_TextToSpeechEndpoint : AbstractTestFixture public async Task Test_01_TextToSpeech() { Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); - var voice = Voices.Voice.Adam; + var voice = (await ElevenLabsClient.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); Assert.NotNull(voice); - var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); - var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, defaultVoiceSettings); + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog."); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); Assert.NotNull(voiceClip); Assert.NotNull(voiceClip.AudioClip); Debug.Log(voiceClip.Id); @@ -29,16 +30,91 @@ public async Task Test_02_StreamTextToSpeech() Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); var voice = (await ElevenLabsClient.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); Assert.NotNull(voice); - var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); var partialClips = new Queue(); - var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.StreamTextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, - clip => partialClips.Enqueue(clip), - defaultVoiceSettings); + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, voiceClip => partialClips.Enqueue(voiceClip)); Assert.NotNull(partialClips); Assert.IsNotEmpty(partialClips); Assert.NotNull(voiceClip); Assert.IsNotNull(voiceClip.AudioClip); Debug.Log(voiceClip.Id); + 
Debug.Log(voiceClip.CachedPath); + } + + [Test] + public async Task Test_03_TextToSpeech_Transcription() + { + Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); + var voice = (await ElevenLabsClient.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); + Assert.NotNull(voice); + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", withTimestamps: true); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); + Assert.NotNull(voiceClip); + Assert.NotNull(voiceClip.AudioClip); + Debug.Log(voiceClip.Id); + Debug.Log(voiceClip.CachedPath); + Assert.NotNull(voiceClip.TimestampedTranscriptCharacters); + Assert.IsNotEmpty(voiceClip.TimestampedTranscriptCharacters); + Debug.Log("| Character | Start Time | End Time |"); + Debug.Log("| --------- | ---------- | -------- |"); + foreach (var character in voiceClip.TimestampedTranscriptCharacters) + { + Debug.Log($"| {character.Character} | {character.StartTime} | {character.EndTime} |"); + } + } + + [Test] + public async Task Test_04_StreamTextToSpeech_Transcription() + { + Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); + var voice = Voices.Voice.Adam; + Assert.NotNull(voice); + voice.Settings ??= await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); + var partialClips = new Queue(); + var characters = new Queue(); + Debug.Log("| Character | Start Time | End Time |"); + Debug.Log("| --------- | ---------- | -------- |"); + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000, withTimestamps: true); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, voiceClip => + { + partialClips.Enqueue(voiceClip.AudioClip); + foreach (var character in voiceClip.TimestampedTranscriptCharacters) + { + characters.Enqueue(character); + Debug.Log($"| {character.Character} | {character.StartTime} | {character.EndTime} |"); + } + }); + 
Assert.NotNull(partialClips); + Assert.NotNull(partialClips); + Assert.IsNotEmpty(partialClips); + Assert.NotNull(voiceClip); + Assert.IsNotNull(voiceClip.AudioClip); + Debug.Log(voiceClip.Id); + Debug.Log(voiceClip.CachedPath); + Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters); + } + + [Test] + public async Task Test_05_LanguageEnforced_TextToSpeech() + { + Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); + var voice = Voices.Voice.Adam; + Assert.NotNull(voice); + var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); + var request = new TextToSpeechRequest( + voice: voice, + text: "Příliš žluťoučký kůň úpěl ďábelské ódy", + voiceSettings: defaultVoiceSettings, + model: Models.Model.TurboV2_5, + outputFormat: OutputFormat.MP3_44100_192, + cacheFormat: CacheFormat.None, + languageCode: "cs"); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); + Assert.NotNull(voiceClip); + Assert.NotNull(voiceClip.AudioClip); + Assert.IsTrue(string.IsNullOrWhiteSpace(voiceClip.CachedPath)); + Debug.Log(voiceClip.Id); + Debug.Log(voiceClip.CachedPath); } } } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/package.json b/ElevenLabs/Packages/com.rest.elevenlabs/package.json index d606fd4..84a4e69 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/package.json +++ b/ElevenLabs/Packages/com.rest.elevenlabs/package.json @@ -3,7 +3,7 @@ "displayName": "ElevenLabs", "description": "A non-official Eleven Labs voice synthesis RESTful client.", "keywords": [], - "version": "3.3.1", + "version": "3.4.0", "unity": "2021.3", "documentationUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs#documentation", "changelogUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs/releases", @@ -17,9 +17,9 @@ "url": "https://github.com/StephenHodgson" }, "dependencies": { - "com.utilities.rest": "2.5.7", - "com.utilities.encoder.ogg": "3.1.4", - 
"com.utilities.encoder.wav": "1.2.2" + "com.utilities.rest": "3.3.0", + "com.utilities.encoder.ogg": "4.0.1", + "com.utilities.encoder.wav": "2.0.2" }, "samples": [ { diff --git a/ElevenLabs/Packages/manifest.json b/ElevenLabs/Packages/manifest.json index 7f0c52d..e81262e 100644 --- a/ElevenLabs/Packages/manifest.json +++ b/ElevenLabs/Packages/manifest.json @@ -1,9 +1,9 @@ { "dependencies": { - "com.unity.ide.rider": "3.0.31", + "com.unity.ide.rider": "3.0.34", "com.unity.ide.visualstudio": "2.0.22", "com.unity.test-framework": "1.3.5", - "com.utilities.buildpipeline": "1.5.0" + "com.utilities.buildpipeline": "1.5.7" }, "scopedRegistries": [ { diff --git a/ElevenLabs/ProjectSettings/ProjectVersion.txt b/ElevenLabs/ProjectSettings/ProjectVersion.txt index 9ba13fd..8386a05 100644 --- a/ElevenLabs/ProjectSettings/ProjectVersion.txt +++ b/ElevenLabs/ProjectSettings/ProjectVersion.txt @@ -1,2 +1,2 @@ -m_EditorVersion: 2021.3.42f1 -m_EditorVersionWithRevision: 2021.3.42f1 (f1197811e8ce) +m_EditorVersion: 2021.3.45f1 +m_EditorVersionWithRevision: 2021.3.45f1 (0da89fac8e79) diff --git a/README.md b/README.md index 71adcd7..9b6b5ae 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ The recommended installation method is though the unity package manager and [Ope - [com.utilities.extensions](https://github.com/RageAgainstThePixel/com.utilities.extensions) - [com.utilities.audio](https://github.com/RageAgainstThePixel/com.utilities.audio) - [com.utilities.encoder.ogg](https://github.com/RageAgainstThePixel/com.utilities.encoder.ogg) + - [com.utilities.encoder.wav](https://github.com/RageAgainstThePixel/com.utilities.encoder.wav) - [com.utilities.rest](https://github.com/RageAgainstThePixel/com.utilities.rest) --- @@ -59,7 +60,7 @@ The recommended installation method is though the unity package manager and [Ope - [Text to Speech](#text-to-speech) - [Stream Text To Speech](#stream-text-to-speech) - [Voices](#voices) - - [Get Shared Voices](#get-shared-voices) :new: + - 
[Get Shared Voices](#get-shared-voices) - [Get All Voices](#get-all-voices) - [Get Default Voice Settings](#get-default-voice-settings) - [Get Voice](#get-voice) @@ -70,13 +71,13 @@ The recommended installation method is though the unity package manager and [Ope - [Samples](#samples) - [Download Voice Sample](#download-voice-sample) - [Delete Voice Sample](#delete-voice-sample) -- [Dubbing](#dubbing) :new: - - [Dub](#dub) :new: - - [Get Dubbing Metadata](#get-dubbing-metadata) :new: - - [Get Transcript for Dub](#get-transcript-for-dub) :new: - - [Get dubbed file](#get-dubbed-file) :new: - - [Delete Dubbing Project](#delete-dubbing-project) :new: -- [SFX Generation](#sfx-generation) :new: +- [Dubbing](#dubbing) + - [Dub](#dub) + - [Get Dubbing Metadata](#get-dubbing-metadata) + - [Get Transcript for Dub](#get-transcript-for-dub) + - [Get dubbed file](#get-dubbed-file) + - [Delete Dubbing Project](#delete-dubbing-project) +- [SFX Generation](#sfx-generation) - [History](#history) - [Get History](#get-history) - [Get History Item](#get-history-item) @@ -265,8 +266,8 @@ Convert text to speech. var api = new ElevenLabsClient(); var text = "The quick brown fox jumps over the lazy dog."; var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); -var defaultVoiceSettings = await api.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); -var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(text, voice, defaultVoiceSettings); +var request = new TextToSpeechRequest(voice, text); +var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request); audioSource.PlayOneShot(voiceClip.AudioClip); ``` @@ -284,18 +285,14 @@ Stream text to speech. 
var api = new ElevenLabsClient(); var text = "The quick brown fox jumps over the lazy dog."; var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); -var partialClips = new Queue(); -var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync( - text, - voice, - partialClip => - { - // Note: Best to queue them and play them in update loop! - // See TextToSpeech sample demo for details - partialClips.Enqueue(partialClip); - }); -// The full completed clip: -audioSource.clip = voiceClip.AudioClip; +var partialClips = new Queue(); +var request = new TextToSpeechRequest(voice, message, model: Model.EnglishTurboV2, outputFormat: OutputFormat.PCM_44100); +var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(request, partialClip => +{ + // Note: check demo scene for best practices + // on how to handle playback with OnAudioFilterRead + partialClips.Enqueue(partialClip); +}); ``` ### [Voices](https://docs.elevenlabs.io/api-reference/voices)