Skip to content

Commit

Permalink
ElevenLabs-DotNet 3.5.0
Browse files Browse the repository at this point in the history
- Added TextToSpeechRequest.ctor overload
  - Added seed property
  - Added applyTextNormalization property
  - Removed deprecated optimizeStreamingLatency property
- Updated Unit Tests
  • Loading branch information
StephenHodgson committed Mar 2, 2025
1 parent f216524 commit dfc4344
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 24 deletions.
35 changes: 33 additions & 2 deletions ElevenLabs-DotNet-Tests/TestFixture_04_TextToSpeechEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,38 @@ public async Task Test_03_TextToSpeech_Transcription()
}

[Test]
public async Task Test_05_LanguageEnforced_TextToSpeech()
/// <summary>
/// Verifies streaming TTS with timestamps: each partial clip is delivered to the callback,
/// and the characters collected from the partial clips match the final clip's transcript.
/// </summary>
public async Task Test_04_StreamTextToSpeech_Transcription()
{
    Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
    var voice = Voices.Voice.Adam;
    Assert.NotNull(voice);
    voice.Settings ??= await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
    var partialClips = new Queue<VoiceClip>();
    var characters = new Queue<TimestampedTranscriptCharacter>();
    Console.WriteLine("| Character | Start Time | End Time |");
    Console.WriteLine("| --------- | ---------- | -------- |");
    var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000, withTimestamps: true);
    var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip =>
    {
        Assert.IsNotNull(partialClip);
        partialClips.Enqueue(partialClip);
        foreach (var character in partialClip.TimestampedTranscriptCharacters)
        {
            characters.Enqueue(character);
            Console.WriteLine($"| {character.Character} | {character.StartTime} | {character.EndTime} |");
        }
        // Keeps the async lambda signature satisfied; the callback does no real async work.
        await Task.CompletedTask;
    });
    // Fixed: Assert.NotNull(partialClips) was duplicated here.
    Assert.NotNull(partialClips);
    Assert.IsNotEmpty(partialClips);
    Assert.NotNull(voiceClip);
    Console.WriteLine(voiceClip.Id);
    // The characters streamed in partial clips must equal the final clip's full transcript.
    Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters);
}

[Test]
public async Task Test_05_01_LanguageEnforced_TextToSpeech()
{
Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
var voice = Voices.Voice.Adam;
Expand Down Expand Up @@ -93,7 +124,7 @@ public async Task Test_05_LanguageEnforced_TextToSpeech()
}

[Test]
public async Task Test_TurboV2_5_LanguageEnforced_TextToSpeech()
public async Task Test_05_02_TurboV2_5_LanguageEnforced_TextToSpeech()
{
Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
var voice = Voices.Voice.Adam;
Expand Down
7 changes: 6 additions & 1 deletion ElevenLabs-DotNet/ElevenLabs-DotNet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,13 @@ All copyrights, trademarks, logos, and assets are the property of their respecti
<SignAssembly>false</SignAssembly>
<IncludeSymbols>true</IncludeSymbols>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Version>3.4.2</Version>
<Version>3.5.0</Version>
<PackageReleaseNotes>
Version 3.5.0
- Added TextToSpeechRequest.ctor overload
- Added seed property
- Added applyTextNormalization property
- Removed deprecated optimizeStreamingLatency property
Version 3.4.2
- Added flash models
- Added stream input support to dubbing endpoint
Expand Down
2 changes: 2 additions & 0 deletions ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Func
{ OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
};

#pragma warning disable CS0618 // Type or member is obsolete
if (request.OptimizeStreamingLatency.HasValue)
{
parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
}
#pragma warning restore CS0618 // Type or member is obsolete

var endpoint = $"/{request.Voice.Id}";

Expand Down
104 changes: 83 additions & 21 deletions ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,32 @@ namespace ElevenLabs.TextToSpeech
{
public sealed class TextToSpeechRequest
{
/// <summary>
/// Obsolete constructor kept for backward compatibility; delegates to the primary constructor.
/// </summary>
[Obsolete("use new .ctor overload")]
public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings)
    : this(null, text, voiceSettings: voiceSettings, model: model)
{
}

/// <summary>
/// Obsolete constructor kept for callers that still pass <paramref name="optimizeStreamingLatency"/>;
/// delegates to the primary constructor and then stores the deprecated latency value.
/// </summary>
[Obsolete("use new .ctor overload")]
public TextToSpeechRequest(
    Voice voice,
    string text,
    Encoding encoding,
    VoiceSettings voiceSettings,
    OutputFormat outputFormat,
    int? optimizeStreamingLatency,
    Model model = null,
    string previousText = null,
    string nextText = null,
    string[] previousRequestIds = null,
    string[] nextRequestIds = null,
    string languageCode = null,
    bool withTimestamps = false)
    : this(voice, text, encoding, voiceSettings, outputFormat, model, previousText, nextText, previousRequestIds, nextRequestIds, languageCode, withTimestamps)
{
    // Deprecated server-side parameter; retained only so existing callers keep compiling.
    OptimizeStreamingLatency = optimizeStreamingLatency;
}

/// <summary>
/// Constructor.
/// </summary>
Expand All @@ -36,40 +56,70 @@ public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings
/// Output format of the generated audio.<br/>
/// Defaults to <see cref="OutputFormat.MP3_44100_128"/>
/// </param>
/// <param name="optimizeStreamingLatency">
/// Optional, You can turn on latency optimizations at some cost of quality.
/// The best possible final latency varies by model.<br/>
/// Possible values:<br/>
/// 0 - default mode (no latency optimizations)<br/>
/// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)<br/>
/// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)<br/>
/// 3 - max latency optimizations<br/>
/// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings
/// (best latency, but can mispronounce e.g. numbers and dates).
/// <param name="previousText">
/// The text that came before the text of the current request.
/// Can be used to improve the speech’s continuity when concatenating together multiple generations or
/// to influence the speech’s continuity in the current generation.
/// </param>
/// <param name="nextText">
/// The text that comes after the text of the current request.
/// Can be used to improve the speech’s continuity when concatenating together multiple generations or
/// to influence the speech’s continuity in the current generation.
/// </param>
/// <param name="previousRequestIds">
/// A list of request_id of the samples that were generated before this generation.
/// Can be used to improve the speech’s continuity when splitting up a large task into multiple requests.
/// The results will be best when the same model is used across the generations. In case both previous_text and previous_request_ids is send,
/// previous_text will be ignored. A maximum of 3 request_ids can be send.
/// </param>
/// <param name="nextRequestIds">
/// A list of request_id of the samples that come after this generation.
/// next_request_ids is especially useful for maintaining the speech’s continuity when regenerating a sample that has had some audio quality issues.
/// For example, if you have generated 3 speech clips, and you want to improve clip 2,
/// passing the request id of clip 3 as a next_request_id (and that of clip 1 as a previous_request_id)
/// will help maintain natural flow in the combined speech.
/// The results will be best when the same model is used across the generations.
/// In case both next_text and next_request_ids is send, next_text will be ignored.
/// A maximum of 3 request_ids can be send.
/// </param>
/// <param name="previousText"></param>
/// <param name="nextText"></param>
/// <param name="previousRequestIds"></param>
/// <param name="nextRequestIds"></param>
/// <param name="languageCode">
/// Optional, Language code (ISO 639-1) used to enforce a language for the model. Currently only <see cref="Model.TurboV2_5"/> supports language enforcement.
/// For other models, an error will be returned if language code is provided.
/// </param>
/// <param name="withTimestamps">
/// Generate speech from text with precise character-level timing information for audio-text synchronization.
/// </param>
/// <param name="seed">
/// If specified, our system will make a best effort to sample deterministically,
/// such that repeated requests with the same seed and parameters should return the same result.
/// Determinism is not guaranteed. Must be integer between 0 and 4294967295.
/// </param>
/// <param name="applyTextNormalization">
/// This parameter controls text normalization with three modes: ‘auto’ (null), ‘on’ (true), and ‘off’ (false).
/// When set to ‘null’, the system will automatically decide whether to apply text normalization (e.g., spelling out numbers).
/// With ‘true’, text normalization will always be applied,
/// while with ‘false’, it will be skipped.
/// Cannot be turned on for ‘eleven_turbo_v2_5’ model.
/// </param>
public TextToSpeechRequest(
Voice voice,
string text,
Encoding encoding = null,
VoiceSettings voiceSettings = null,
OutputFormat outputFormat = OutputFormat.MP3_44100_128,
int? optimizeStreamingLatency = null,
Model model = null,
string previousText = null,
string nextText = null,
string[] previousRequestIds = null,
string[] nextRequestIds = null,
string languageCode = null,
bool withTimestamps = false)
bool withTimestamps = false,
int? seed = null,
bool? applyTextNormalization = null)
{
if (string.IsNullOrWhiteSpace(text))
{
Expand All @@ -89,10 +139,9 @@ public TextToSpeechRequest(

Text = text;
Model = model ?? Models.Model.FlashV2;
Voice = voice;
Voice = string.IsNullOrWhiteSpace(voice) ? Voice.Adam : voice;
VoiceSettings = voiceSettings ?? voice.Settings;
OutputFormat = outputFormat;
OptimizeStreamingLatency = optimizeStreamingLatency;
PreviousText = previousText;
NextText = nextText;
if (previousRequestIds?.Length > 3)
Expand All @@ -107,6 +156,12 @@ public TextToSpeechRequest(
NextRequestIds = nextRequestIds;
LanguageCode = languageCode;
WithTimestamps = withTimestamps;
Seed = seed;

if (applyTextNormalization.HasValue)
{
ApplyTextNormalization = applyTextNormalization.Value ? "on" : "off";
}
}

[JsonPropertyName("text")]
Expand All @@ -129,6 +184,7 @@ public TextToSpeechRequest(
public OutputFormat OutputFormat { get; }

[JsonIgnore]
[Obsolete("Deprecated")]
public int? OptimizeStreamingLatency { get; }

[JsonPropertyName("next_text")]
Expand All @@ -146,11 +202,17 @@ public TextToSpeechRequest(
[JsonPropertyName("next_request_ids")]
public string[] NextRequestIds { get; }


[JsonPropertyName("language_code")]
public string LanguageCode { get; }

[JsonIgnore]
public bool WithTimestamps { get; }

[JsonPropertyName("seed")]
public int? Seed { get; }

[JsonPropertyName("apply_text_normalization")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public string ApplyTextNormalization { get; }
}
}

0 comments on commit dfc4344

Please sign in to comment.