From a1f61248db44701510d36c648f2f6a15b644dcfd Mon Sep 17 00:00:00 2001 From: ivaaan Date: Thu, 25 Sep 2025 15:05:59 -0700 Subject: [PATCH 1/9] add Examples 1 and 2 from TS quickstart --- tts/tts-dotnet-quickstart/Program.cs | 258 +++++++++++------- .../tts-csharp-quickstart.csproj | 1 + 2 files changed, 155 insertions(+), 104 deletions(-) diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index 9341b761..2fbaac6a 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -2,129 +2,79 @@ using System.Diagnostics; using System.IO; using System.Linq; -using System.Runtime.InteropServices; using System.Threading.Tasks; -using System.Collections.Concurrent; using System.Threading; using Hume; using Hume.Tts; +using System.Collections.Generic; namespace TtsCsharpQuickstart; class Program { - static async Task Main(string[] args) + private static string? _apiKey; + private static HumeClient? _client; + private static string? _outputDir; + + static async Task RunExamplesAsync() { Console.WriteLine("Starting..."); - - var apiKey = Environment.GetEnvironmentVariable("HUME_API_KEY"); - if (string.IsNullOrEmpty(apiKey)) + + _apiKey = Environment.GetEnvironmentVariable("HUME_API_KEY"); + if (string.IsNullOrEmpty(_apiKey)) { throw new InvalidOperationException("HUME_API_KEY not found in environment variables."); } - var client = new HumeClient(apiKey); - + _client = new HumeClient(_apiKey); + // Create an output directory in the temporary folder - var timestamp = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); - var outputDir = Path.Combine(Path.GetTempPath(), $"hume-audio-{timestamp}"); - Directory.CreateDirectory(outputDir); - - Console.WriteLine($"Results will be written to {outputDir}"); - - // Synthesizing speech with a new voice - var speech1 = await client.Tts.SynthesizeJsonAsync - ( - new PostedTts - { - Utterances = new List - { - new PostedUtterance - { - Description = "A refined, British aristocrat", - Text = "Take an arrow from the quiver." - } - } - } - ); + _outputDir = Path.Combine(Path.GetTempPath(), "hume-audio"); + Directory.CreateDirectory(_outputDir); - await WriteResultToFile(speech1.Generations.First().Audio, "speech1_0", outputDir); + Console.WriteLine($"Results will be written to {_outputDir}"); - var name = $"aristocrat-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}"; - - // Naming the voice and saving it to your voice library - // for later use - var generationId = speech1.Generations.First().GenerationId; - await client.Tts.Voices.CreateAsync(new PostedVoice - { - Name = name, - GenerationId = generationId - }); + await Example1Async(); + await Example2Async(); + // await Example3Async(); - // Continuing previously-generated speech - var speech2 = await client.Tts.SynthesizeJsonAsync(new PostedTts - { - Utterances = new List - { - new PostedUtterance - { - // Using a voice from your voice library - Voice = new PostedUtteranceVoiceWithName { Name = name }, - Text = "Now take a bow." - } - }, - // Providing previous context to maintain consistency. - // This should cause "bow" to rhyme with "toe" and not "cow". - Context = new PostedContextWithGenerationId { GenerationId = generationId }, - NumGenerations = 2 - } - ); + Console.WriteLine("Done"); + } - await WriteResultToFile(speech2.Generations.First().Audio, "speech2_0", outputDir); - await WriteResultToFile(speech2.Generations.Skip(1).First().Audio, "speech2_1", outputDir); + static async Task Main(string[] args) + { + await RunExamplesAsync(); + } - // Acting instructions: modulating the speech from a previously-generated voice - var speech3 = await client.Tts.SynthesizeJsonAsync(new PostedTts - { - Utterances = new List - { - new PostedUtterance - { - Voice = new PostedUtteranceVoiceWithName { Name = name }, - Description = "Murmured softly, with a heavy dose of sarcasm and contempt", - Text = "Does he even know how to use that thing?" - } - }, - Context = new PostedContextWithGenerationId - { - GenerationId = speech2.Generations.First().GenerationId - }, - NumGenerations = 1 - } - ); + /** Example 1: Using a pre-existing voice. + * + * Use this method if you want to synthesize speech with a high-quality voice from + * Hume's Voice Library, or specify `provider: 'CUSTOM_VOICE'` to use a voice that + * you created previously via the Hume Platform or the API. + * */ + static async Task Example1Async() + { + Console.WriteLine("Example 1: Synthesizing audio using a pre-existing voice..."); - await WriteResultToFile(speech3.Generations.First().Audio, "speech3_0", outputDir); + var voice = new PostedUtteranceVoiceWithName + { + Name = "Ava Song", + Provider = new VoiceProvider(Hume.Tts.VoiceProvider.Values.HumeAi) + }; - // Streaming example with real-time audio playback - Console.WriteLine("Streaming audio in real-time..."); - var voice = new PostedUtteranceVoiceWithName { Name = name }; - - using var streamingPlayer = GetStreamingAudioPlayer(); + using var streamingPlayer = StartAudioPlayer(); await streamingPlayer.StartStreamingAsync(); - - await foreach (var snippet in client.Tts.SynthesizeJsonStreamingAsync(new PostedTts + + await foreach (var snippet in _client!.Tts.SynthesizeJsonStreamingAsync(new PostedTts { - Context = new PostedContextWithGenerationId - { - GenerationId = speech3.Generations.First().GenerationId - }, Utterances = new List { - new PostedUtterance { Text = "He's drawn the bow...", Voice = voice }, - new PostedUtterance { Text = "he's fired the arrow...", Voice = voice }, - new PostedUtterance { Text = "I can't believe it! A perfect bullseye!", Voice = voice } + new PostedUtterance { Text = "Dogs became domesticated between 23,000 and 30,000 years ago.", Voice = voice }, }, Format = new Format(new Format.Wav()), + // With `stripHeaders: true`, only the first audio chunk will contain + // headers in container formats (wav, mp3). This allows you to start a + // single audio player and stream all audio chunks to it without artifacts. StripHeaders = true, })) { @@ -132,14 +82,111 @@ await client.Tts.Voices.CreateAsync(new PostedVoice } await streamingPlayer.StopStreamingAsync(); + Console.WriteLine("Done!"); + } + + /** Example 2: Voice Design. + * + * This method demonstrates how you can create a custom voice via the API. + * First, synthesize speech by specifying a `description` prompt and characteristic + * sample text. Specify the generation_id of the resulting audio in a subsequent + * call to create a voice. Then, future calls to tts endpoints can specify the + * voice by name or generation_id. + */ + static async Task Example2Async() + { + Console.WriteLine("Example 2: Voice Design - Creating a custom voice..."); - Console.WriteLine("Done"); + var result1 = await _client!.Tts.SynthesizeJsonAsync(new PostedTts + { + Utterances = new List + { + new PostedUtterance + { + Description = "Crisp, upper-class British accent with impeccably articulated consonants and perfectly placed vowels. Authoritative and theatrical, as if giving a lecture.", + Text = "The science of speech. That's my profession; also my hobby. Happy is the man who can make a living by his hobby!" + } + }, + NumGenerations = 2, + StripHeaders = true, + }); + + Console.WriteLine("Example 2: Synthesizing voice options for voice creation..."); + using var audioPlayer = StartAudioPlayer(); + await audioPlayer.StartStreamingAsync(); + + int sampleNumber = 1; + var generationsList = result1.Generations.ToList(); + foreach (var generation in generationsList) + { + await audioPlayer.SendAudioAsync(Convert.FromBase64String(generation.Audio)); + Console.WriteLine($"Playing option {sampleNumber}..."); + sampleNumber++; + } + await audioPlayer.StopStreamingAsync(); + + // Prompt user to select which voice they prefer + Console.WriteLine("\nWhich voice did you prefer?"); + Console.WriteLine($"1. First voice (generation ID: {generationsList[0].GenerationId})"); + Console.WriteLine($"2. Second voice (generation ID: {generationsList[1].GenerationId})"); + + string? userChoice; + int selectedIndex; + do + { + Console.Write("Enter your choice (1 or 2): "); + userChoice = Console.ReadLine(); + } while (!int.TryParse(userChoice, out selectedIndex) || (selectedIndex != 1 && selectedIndex != 2)); + + var selectedGenerationId = generationsList[selectedIndex - 1].GenerationId; + Console.WriteLine($"Selected voice option {selectedIndex} (generation ID: {selectedGenerationId})"); + + // Save the selected voice + var voiceName = $"higgins-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}"; + await _client!.Tts.Voices.CreateAsync(new PostedVoice + { + Name = voiceName, + GenerationId = selectedGenerationId, + }); + + Console.WriteLine($"Created voice: {voiceName}"); + + Console.WriteLine($"Continuing speech with the selected voice: {voiceName}"); + + using var streamingPlayer2 = StartAudioPlayer(); + await streamingPlayer2.StartStreamingAsync(); + + var stream = (System.Collections.Generic.IAsyncEnumerable)_client!.Tts.SynthesizeJsonStreamingAsync(new PostedTts + { + Utterances = new List + { + new PostedUtterance + { + Voice = new PostedUtteranceVoiceWithName { Name = voiceName }, + Text = "YOU can spot an Irishman or a Yorkshireman by his brogue. I can place any man within six miles. I can place him within two miles in London. Sometimes within two streets.", + Description = "Bragging about his abilities" + } + }, + Context = new PostedContextWithGenerationId + { + GenerationId = selectedGenerationId + }, + StripHeaders = true, + }); + + await foreach (var snippet in stream) + { + await streamingPlayer2.SendAudioAsync(Convert.FromBase64String(snippet.Audio)); + } + await streamingPlayer2.StopStreamingAsync(); + Console.WriteLine("Done!"); } // Real-time streaming audio player using pipe-based approach public class StreamingAudioPlayer : IDisposable { private Process? _audioProcess; + public Stream? StandardInput { get; private set; } private bool _isStreaming = false; public Task StartStreamingAsync() @@ -153,7 +200,7 @@ public Task StartStreamingAsync() public Task SendAudioAsync(byte[] audioBytes) { if (!_isStreaming || _audioProcess?.HasExited != false) return Task.CompletedTask; - + try { _audioProcess?.StandardInput.BaseStream.Write(audioBytes, 0, audioBytes.Length); @@ -163,14 +210,14 @@ public Task SendAudioAsync(byte[] audioBytes) { Console.WriteLine($"Error sending audio chunk: {ex.Message}"); } - + return Task.CompletedTask; } public async Task StopStreamingAsync() { _isStreaming = false; - + try { if (_audioProcess != null && !_audioProcess.HasExited) @@ -183,7 +230,7 @@ public async Task StopStreamingAsync() { Console.WriteLine($"Error stopping audio process: {ex.Message}"); } - + Console.WriteLine("Streaming audio player stopped."); } @@ -201,15 +248,18 @@ private void StartAudioProcess() RedirectStandardError = true, RedirectStandardOutput = true }; - + _audioProcess = Process.Start(startInfo); - + if (_audioProcess == null) { throw new InvalidOperationException("Failed to start ffplay process"); } - - _audioProcess.ErrorDataReceived += (sender, e) => { + + StandardInput = _audioProcess.StandardInput.BaseStream; + + _audioProcess.ErrorDataReceived += (sender, e) => + { if (!string.IsNullOrEmpty(e.Data)) Console.WriteLine($"ffplay: {e.Data}"); }; @@ -236,7 +286,7 @@ public void Dispose() } } - private static StreamingAudioPlayer GetStreamingAudioPlayer() + private static StreamingAudioPlayer StartAudioPlayer() { return new StreamingAudioPlayer(); } diff --git a/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj b/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj index 935fe46e..c10cd989 100644 --- a/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj +++ b/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj @@ -6,6 +6,7 @@ TtsCsharpQuickstart enable enable + latest From 28d6ab9fde25079fee598eeb14d35252911dd855 Mon Sep 17 00:00:00 2001 From: ivaaan Date: Thu, 25 Sep 2025 15:41:52 -0700 Subject: [PATCH 2/9] Example 3 not working yet --- tts/tts-dotnet-quickstart/Program.cs | 46 +++- .../StreamingTtsService.cs | 214 ++++++++++++++++++ 2 files changed, 257 insertions(+), 3 deletions(-) create mode 100644 tts/tts-dotnet-quickstart/StreamingTtsService.cs diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index 2fbaac6a..e9730177 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -7,6 +7,7 @@ using Hume; using Hume.Tts; using System.Collections.Generic; +using TtsCsharpQuickstart; namespace TtsCsharpQuickstart; @@ -34,9 +35,9 @@ static async Task RunExamplesAsync() Console.WriteLine($"Results will be written to {_outputDir}"); - await Example1Async(); - await Example2Async(); - // await Example3Async(); + // await Example1Async(); + // await Example2Async(); + await Example3Async(); Console.WriteLine("Done"); } @@ -182,6 +183,45 @@ static async Task Example2Async() Console.WriteLine("Done!"); } + static async Task Example3Async() + { + Console.WriteLine("Example 3: Bidirectional streaming..."); + + using var streamingTtsClient = new StreamingTtsClient(_apiKey!); + await streamingTtsClient.ConnectAsync(); + + using var audioPlayer = StartAudioPlayer(); + using var silenceFiller = new SilenceFiller(audioPlayer.StandardInput!); + silenceFiller.Start(); + + var sendInputTask = Task.Run(async () => + { + await streamingTtsClient.SendAsync(new { Utterances = new List { new PostedUtterance { Text = "Hello world." } } }); + await streamingTtsClient.SendFlushAsync(); + Console.WriteLine("Waiting 8 seconds..."); + await Task.Delay(8000); + await streamingTtsClient.SendAsync(new { Utterances = new List { new PostedUtterance { Text = "Goodbye, world." } } }); + await streamingTtsClient.SendFlushAsync(); + await streamingTtsClient.SendCloseAsync(); + }); + + var handleMessagesTask = Task.Run(async () => + { + Console.WriteLine("Playing audio: Example 3 - Bidirectional streaming"); + await foreach (var chunk in streamingTtsClient.ReceiveAudioChunksAsync()) + { + var buf = Convert.FromBase64String(chunk.Audio); + silenceFiller.WriteAudio(buf); + } + await silenceFiller.EndStreamAsync(); + await audioPlayer.StopStreamingAsync(); + }); + + await Task.WhenAll(sendInputTask, handleMessagesTask); + + Console.WriteLine("Done!"); + } + // Real-time streaming audio player using pipe-based approach public class StreamingAudioPlayer : IDisposable { diff --git a/tts/tts-dotnet-quickstart/StreamingTtsService.cs b/tts/tts-dotnet-quickstart/StreamingTtsService.cs new file mode 100644 index 00000000..3d09f92f --- /dev/null +++ b/tts/tts-dotnet-quickstart/StreamingTtsService.cs @@ -0,0 +1,214 @@ +using System; +using System.Collections.Generic; +using System.Collections.Concurrent; +using System.IO; +using System.Linq; +using System.Net.WebSockets; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Hume.Tts; + +namespace TtsCsharpQuickstart +{ + public class Queue + { + private readonly List _pushed = new List(); + private TaskCompletionSource? _waiting = null; + private bool _ended = false; + + public void Push(T x) + { + if (_ended) return; + if (_waiting != null) + { + _waiting.SetResult(x); + _waiting = null; + } + else _pushed.Add(x); + } + + public void End() + { + if (_ended) return; + _ended = true; + if (_waiting != null) { _waiting.SetResult(default); _waiting = null; } + } + + public async IAsyncEnumerable GetAsyncEnumerable() + { + while (true) + { + if (_pushed.Any()) + { + var item = _pushed[0]; + _pushed.RemoveAt(0); + yield return item; + } + else + { + _waiting = new TaskCompletionSource(); + var x = await _waiting.Task; + if (x == null) break; + if (x is T concreteX) yield return concreteX; + else throw new InvalidOperationException("Received null from queue when a non-null value was expected."); + } + } + } + } + + public class StreamingTtsClient : IDisposable + { + private readonly ClientWebSocket _webSocket; + private readonly string _apiKey; + private readonly Uri _websocketUri; + private readonly Queue _queue = new Queue(); + private CancellationTokenSource _cts = new CancellationTokenSource(); + + public StreamingTtsClient(string apiKey) + { + _apiKey = apiKey; + _webSocket = new ClientWebSocket(); + _websocketUri = new Uri($"wss://api.hume.ai/v0/tts/stream/input?api_key={apiKey}&no_binary=true&instant_mode=true&strip_headers=true&format_type=pcm"); + } + + public async Task ConnectAsync() + { + try + { + await _webSocket.ConnectAsync(_websocketUri, _cts.Token); + Console.WriteLine("WebSocket connected."); + } + catch (WebSocketException ex) + { + Console.WriteLine($"WebSocket connection error: {ex.Message}"); + throw; + } + + _ = Task.Run(async () => + { + var buffer = new byte[8192]; + try + { + while (_webSocket.State == WebSocketState.Open) + { + var result = await _webSocket.ReceiveAsync(new ArraySegment(buffer), _cts.Token); + + if (result.MessageType == WebSocketMessageType.Close) + { + await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Server closed", _cts.Token); + _queue.End(); + break; + } + + var json = System.Text.Encoding.UTF8.GetString(buffer, 0, result.Count); + _queue.Push(json); + } + } + catch (WebSocketException ex) + { + Console.WriteLine($"WebSocket error in Receive loop: {ex.Message}"); + _queue.End(); + } + catch (OperationCanceledException) + { + _queue.End(); + } + }); + } + + public async Task SendAsync(object message) + { + if (_webSocket.State != WebSocketState.Open) throw new InvalidOperationException("WebSocket not connected."); + var jsonMessage = JsonSerializer.Serialize(message); + var bytes = System.Text.Encoding.UTF8.GetBytes(jsonMessage); + await _webSocket.SendAsync(new ArraySegment(bytes), WebSocketMessageType.Text, true, _cts.Token); + } + + public async Task SendFlushAsync() + { + if (_webSocket.State != WebSocketState.Open) throw new InvalidOperationException("WebSocket not connected."); + await SendAsync(new { flush = true }); + } + + public async Task SendCloseAsync() + { + if (_webSocket.State != WebSocketState.Open) throw new InvalidOperationException("WebSocket not connected."); + await SendAsync(new { close = true }); + } + + public async IAsyncEnumerable ReceiveAudioChunksAsync() + { + await foreach (var item in _queue.GetAsyncEnumerable()) + { + using (JsonDocument doc = JsonDocument.Parse(item)) + { + if (doc.RootElement.TryGetProperty("audio", out JsonElement audioElement)) + { + // It's an audio chunk, deserialize and yield + var chunk = JsonSerializer.Deserialize(item)!; + yield return chunk; + } + } + } + } + + public void Dispose() + { + _cts.Cancel(); + _webSocket.Dispose(); + _cts.Dispose(); + } + } + + public class SilenceFiller : IDisposable + { + private readonly BlockingCollection _audioBuffer = new BlockingCollection(); + private readonly CancellationTokenSource _cts = new CancellationTokenSource(); + private Task? _playbackTask; + private readonly Stream _outputStream; + + public SilenceFiller(Stream outputStream) + { + _outputStream = outputStream; + } + + public void WriteAudio(byte[] audioBytes) + { + _audioBuffer.Add(audioBytes); + } + + public void Start() + { + _playbackTask = Task.Run(async () => + { + try + { + foreach (var audio in _audioBuffer.GetConsumingEnumerable(_cts.Token)) + { + await _outputStream.WriteAsync(audio, _cts.Token); + await _outputStream.FlushAsync(_cts.Token); + } + } + catch (OperationCanceledException) + { + // Expected when _cts.Cancel() is called + } + }); + } + + public async Task EndStreamAsync() + { + _audioBuffer.CompleteAdding(); + if (_playbackTask != null) await _playbackTask; + } + + public void Dispose() + { + _cts.Cancel(); + _audioBuffer.Dispose(); + _cts.Dispose(); + } + } +} From b4c2e70dc7169d729f468fb1ebeabeab73d2a260 Mon Sep 17 00:00:00 2001 From: ivaaan Date: Fri, 10 Oct 2025 17:02:28 -0700 Subject: [PATCH 3/9] upd Example 3 --- tts/tts-dotnet-quickstart/Program.cs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index e9730177..e562077a 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -72,7 +72,6 @@ static async Task Example1Async() { new PostedUtterance { Text = "Dogs became domesticated between 23,000 and 30,000 years ago.", Voice = voice }, }, - Format = new Format(new Format.Wav()), // With `stripHeaders: true`, only the first audio chunk will contain // headers in container formats (wav, mp3). This allows you to start a // single audio player and stream all audio chunks to it without artifacts. @@ -85,7 +84,7 @@ static async Task Example1Async() await streamingPlayer.StopStreamingAsync(); Console.WriteLine("Done!"); } - + /** Example 2: Voice Design. * * This method demonstrates how you can create a custom voice via the API. @@ -191,16 +190,16 @@ static async Task Example3Async() await streamingTtsClient.ConnectAsync(); using var audioPlayer = StartAudioPlayer(); + await audioPlayer.StartStreamingAsync(); using var silenceFiller = new SilenceFiller(audioPlayer.StandardInput!); silenceFiller.Start(); var sendInputTask = Task.Run(async () => { - await streamingTtsClient.SendAsync(new { Utterances = new List { new PostedUtterance { Text = "Hello world." } } }); + await streamingTtsClient.SendAsync(new { text = "Hello" }); + await streamingTtsClient.SendAsync(new { text = " world." }); await streamingTtsClient.SendFlushAsync(); - Console.WriteLine("Waiting 8 seconds..."); - await Task.Delay(8000); - await streamingTtsClient.SendAsync(new { Utterances = new List { new PostedUtterance { Text = "Goodbye, world." } } }); + await streamingTtsClient.SendAsync(new { text = " Goodbye, world." }); await streamingTtsClient.SendFlushAsync(); await streamingTtsClient.SendCloseAsync(); }); @@ -281,7 +280,7 @@ private void StartAudioProcess() var startInfo = new ProcessStartInfo { FileName = "ffplay", - Arguments = "-nodisp -autoexit -infbuf -i -", + Arguments = "-f s16le -ar 48000 -fflags nobuffer -flags low_delay -probesize 32 -analyzeduration 0 -i - -nodisp -autoexit", UseShellExecute = false, CreateNoWindow = true, RedirectStandardInput = true, From 5e687d5c9939ea3642a684b57566cdf5b4068dc0 Mon Sep 17 00:00:00 2001 From: ivaaan Date: Fri, 10 Oct 2025 17:05:59 -0700 Subject: [PATCH 4/9] upd SDK version dep to 0.2.1 (incl. websocket upd from Fern) --- tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj b/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj index c10cd989..ac9e85f5 100644 --- a/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj +++ b/tts/tts-dotnet-quickstart/tts-csharp-quickstart.csproj @@ -10,7 +10,7 @@ - + From 167574f3247b28cbd394b6d3899515fb55b60098 Mon Sep 17 00:00:00 2001 From: ivaaan Date: Fri, 10 Oct 2025 19:36:35 -0700 Subject: [PATCH 5/9] fix Ex. 1, 2 that were broken after version bump --- tts/tts-dotnet-quickstart/Program.cs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index 3ca08806..1271ef19 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -39,9 +39,9 @@ static async Task RunExamplesAsync() Console.WriteLine($"Results will be written to {_outputDir}"); - // await Example1Async(); - // await Example2Async(); - await Example3Async(); + await Example1Async(); + await Example2Async(); + // await Example3Async(); Console.WriteLine("Done"); } @@ -82,9 +82,11 @@ static async Task Example1Async() StripHeaders = true, })) { - if (snippet.IsAudio) + // Handle both TtsOutput and OneOf types for SDK compatibility + var snippetValue = snippet.Value; + if (snippetValue is SnippetAudioChunk audio) { - await streamingPlayer.SendAudioAsync(Convert.FromBase64String(snippet.AsAudio().Audio)); + await streamingPlayer.SendAudioAsync(Convert.FromBase64String(audio.Audio)); } } @@ -163,7 +165,7 @@ static async Task Example2Async() using var streamingPlayer2 = StartAudioPlayer(); await streamingPlayer2.StartStreamingAsync(); - var stream = (System.Collections.Generic.IAsyncEnumerable)_client!.Tts.SynthesizeJsonStreamingAsync(new PostedTts + await foreach (var snippet in _client!.Tts.SynthesizeJsonStreamingAsync(new PostedTts { Utterances = new List { @@ -179,11 +181,14 @@ static async Task Example2Async() GenerationId = selectedGenerationId }, StripHeaders = true, - }); - - await foreach (var snippet in stream) + })) { - await streamingPlayer2.SendAudioAsync(Convert.FromBase64String(snippet.Audio)); + // Handle both TtsOutput and OneOf types for SDK compatibility + var snippetValue = snippet.Value; + if (snippetValue is SnippetAudioChunk audio) + { + await streamingPlayer2.SendAudioAsync(Convert.FromBase64String(audio.Audio)); + } } await streamingPlayer2.StopStreamingAsync(); Console.WriteLine("Done!"); @@ -287,7 +292,7 @@ private void StartAudioProcess() var startInfo = new ProcessStartInfo { FileName = "ffplay", - Arguments = "-f s16le -ar 48000 -fflags nobuffer -flags low_delay -probesize 32 -analyzeduration 0 -i - -nodisp -autoexit", + Arguments = "-nodisp -autoexit -infbuf -i -", UseShellExecute = false, CreateNoWindow = true, RedirectStandardInput = true, From e16f17c1dbb3b9b73ddbbfbc7e8fb507c25c6553 Mon Sep 17 00:00:00 2001 From: ivaaan Date: Fri, 10 Oct 2025 19:45:11 -0700 Subject: [PATCH 6/9] fix Example 3, now all 3 examples work with v0.2.1 --- tts/tts-dotnet-quickstart/Program.cs | 17 ++++++++++++++--- .../StreamingTtsService.cs | 19 ++++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index 1271ef19..a7cb2961 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -41,7 +41,7 @@ static async Task RunExamplesAsync() await Example1Async(); await Example2Async(); - // await Example3Async(); + await Example3Async(); Console.WriteLine("Done"); } @@ -201,7 +201,7 @@ static async Task Example3Async() using var streamingTtsClient = new StreamingTtsClient(_apiKey!); await streamingTtsClient.ConnectAsync(); - using var audioPlayer = StartAudioPlayer(); + using var audioPlayer = new StreamingAudioPlayer("pcm"); await audioPlayer.StartStreamingAsync(); using var silenceFiller = new SilenceFiller(audioPlayer.StandardInput!); silenceFiller.Start(); @@ -239,6 +239,12 @@ public class StreamingAudioPlayer : IDisposable private Process? _audioProcess; public Stream? StandardInput { get; private set; } private bool _isStreaming = false; + private readonly string _audioFormat; + + public StreamingAudioPlayer(string audioFormat = "container") + { + _audioFormat = audioFormat; + } public Task StartStreamingAsync() { @@ -289,10 +295,15 @@ private void StartAudioProcess() { try { + // Choose ffplay arguments based on audio format + var arguments = _audioFormat == "pcm" + ? "-f s16le -ar 48000 -fflags nobuffer -flags low_delay -probesize 32 -analyzeduration 0 -i - -nodisp -autoexit" + : "-nodisp -autoexit -infbuf -i -"; + var startInfo = new ProcessStartInfo { FileName = "ffplay", - Arguments = "-nodisp -autoexit -infbuf -i -", + Arguments = arguments, UseShellExecute = false, CreateNoWindow = true, RedirectStandardInput = true, diff --git a/tts/tts-dotnet-quickstart/StreamingTtsService.cs b/tts/tts-dotnet-quickstart/StreamingTtsService.cs index 3d09f92f..401bbd28 100644 --- a/tts/tts-dotnet-quickstart/StreamingTtsService.cs +++ b/tts/tts-dotnet-quickstart/StreamingTtsService.cs @@ -70,7 +70,7 @@ public StreamingTtsClient(string apiKey) { _apiKey = apiKey; _webSocket = new ClientWebSocket(); - _websocketUri = new Uri($"wss://api.hume.ai/v0/tts/stream/input?api_key={apiKey}&no_binary=true&instant_mode=true&strip_headers=true&format_type=pcm"); + _websocketUri = new Uri($"wss://api.hume.ai/v0/tts/stream/input?api_key={apiKey}&no_binary=true&instant_mode=true&format_type=pcm"); } public async Task ConnectAsync() @@ -89,6 +89,7 @@ public async Task ConnectAsync() _ = Task.Run(async () => { var buffer = new byte[8192]; + var messageBuffer = new MemoryStream(); try { while (_webSocket.State == WebSocketState.Open) @@ -102,8 +103,16 @@ public async Task ConnectAsync() break; } - var json = System.Text.Encoding.UTF8.GetString(buffer, 0, result.Count); - _queue.Push(json); + // Write the received chunk to the message buffer + messageBuffer.Write(buffer, 0, result.Count); + + // Check if this is the end of the message + if (result.EndOfMessage) + { + var json = System.Text.Encoding.UTF8.GetString(messageBuffer.ToArray()); + _queue.Push(json); + messageBuffer.SetLength(0); // Reset the buffer for the next message + } } } catch (WebSocketException ex) @@ -115,6 +124,10 @@ public async Task ConnectAsync() { _queue.End(); } + finally + { + messageBuffer.Dispose(); + } }); } From bf6c6ae64fd15949103277bc95fa114b74f081dc Mon Sep 17 00:00:00 2001 From: ivaaan Date: Fri, 10 Oct 2025 20:04:28 -0700 Subject: [PATCH 7/9] fix race condition in audio chunks for Example 3 --- tts/tts-dotnet-quickstart/Program.cs | 19 ++-- .../StreamingTtsService.cs | 102 +++++++++++++++--- 2 files changed, 98 insertions(+), 23 deletions(-) diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index a7cb2961..2cada88e 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -40,7 +40,7 @@ static async Task RunExamplesAsync() Console.WriteLine($"Results will be written to {_outputDir}"); await Example1Async(); - await Example2Async(); + // await Example2Async(); await Example3Async(); Console.WriteLine("Done"); @@ -201,7 +201,8 @@ static async Task Example3Async() using var streamingTtsClient = new StreamingTtsClient(_apiKey!); await streamingTtsClient.ConnectAsync(); - using var audioPlayer = new StreamingAudioPlayer("pcm"); + // For bidirectional streaming with PCM, use specific ffplay arguments + using var audioPlayer = new StreamingAudioPlayer(usePcmFormat: true); await audioPlayer.StartStreamingAsync(); using var silenceFiller = new SilenceFiller(audioPlayer.StandardInput!); silenceFiller.Start(); @@ -210,7 +211,11 @@ static async Task Example3Async() { await streamingTtsClient.SendAsync(new { text = "Hello" }); await streamingTtsClient.SendAsync(new { text = " world." }); + // The whitespace ^ is important, otherwise the model would see + // "Helloworld." and not "Hello world." await streamingTtsClient.SendFlushAsync(); + Console.WriteLine("Waiting 8 seconds..."); + await Task.Delay(8000); await streamingTtsClient.SendAsync(new { text = " Goodbye, world." }); await streamingTtsClient.SendFlushAsync(); await streamingTtsClient.SendCloseAsync(); @@ -239,11 +244,11 @@ public class StreamingAudioPlayer : IDisposable private Process? _audioProcess; public Stream? StandardInput { get; private set; } private bool _isStreaming = false; - private readonly string _audioFormat; + private readonly bool _usePcmFormat; - public StreamingAudioPlayer(string audioFormat = "container") + public StreamingAudioPlayer(bool usePcmFormat = false) { - _audioFormat = audioFormat; + _usePcmFormat = usePcmFormat; } public Task StartStreamingAsync() @@ -295,8 +300,8 @@ private void StartAudioProcess() { try { - // Choose ffplay arguments based on audio format - var arguments = _audioFormat == "pcm" + // PCM format requires explicit format specification, WAV/MP3 can auto-detect + var arguments = _usePcmFormat ? "-f s16le -ar 48000 -fflags nobuffer -flags low_delay -probesize 32 -analyzeduration 0 -i - -nodisp -autoexit" : "-nodisp -autoexit -infbuf -i -"; diff --git a/tts/tts-dotnet-quickstart/StreamingTtsService.cs b/tts/tts-dotnet-quickstart/StreamingTtsService.cs index 401bbd28..9520ac3c 100644 --- a/tts/tts-dotnet-quickstart/StreamingTtsService.cs +++ b/tts/tts-dotnet-quickstart/StreamingTtsService.cs @@ -12,48 +12,98 @@ namespace TtsCsharpQuickstart { + /// + /// Thread-safe async queue implementation. + /// + /// Race Condition Fix: This queue is accessed from multiple threads: + /// 1. The WebSocket receive loop (background thread) calls Push() + /// 2. The ReceiveAudioChunksAsync() enumerator (foreground thread) calls GetAsyncEnumerable() + /// + /// Without proper locking, these concurrent accesses caused a race condition where audio chunks + /// were lost (only 1 out of 22 chunks was being yielded). The lock ensures all state (_pushed, + /// _waiting, _ended) is accessed atomically. TaskCompletionSource is completed outside the lock + /// to avoid potential deadlocks. + /// public class Queue { + private readonly object _lock = new object(); private readonly List _pushed = new List(); private TaskCompletionSource? _waiting = null; private bool _ended = false; public void Push(T x) { - if (_ended) return; - if (_waiting != null) + TaskCompletionSource? toComplete = null; + lock (_lock) { - _waiting.SetResult(x); - _waiting = null; + if (_ended) return; + if (_waiting != null) + { + toComplete = _waiting; + _waiting = null; + } + else _pushed.Add(x); } - else _pushed.Add(x); + // Complete outside the lock to avoid potential deadlocks + toComplete?.SetResult(x); } public void End() { - if (_ended) return; - _ended = true; - if (_waiting != null) { _waiting.SetResult(default); _waiting = null; } + TaskCompletionSource? toComplete = null; + lock (_lock) + { + if (_ended) return; + _ended = true; + if (_waiting != null) + { + toComplete = _waiting; + _waiting = null; + } + } + // Complete outside the lock + toComplete?.SetResult(default); } public async IAsyncEnumerable GetAsyncEnumerable() { while (true) { - if (_pushed.Any()) + T? item = default; + bool hasItem = false; + TaskCompletionSource? tcs = null; + + lock (_lock) + { + if (_pushed.Any()) + { + item = _pushed[0]; + _pushed.RemoveAt(0); + hasItem = true; + } + else if (!_ended) + { + _waiting = new TaskCompletionSource(); + tcs = _waiting; + } + } + + if (hasItem) { - var item = _pushed[0]; - _pushed.RemoveAt(0); - yield return item; + yield return item!; } - else + else if (tcs != null) { - _waiting = new TaskCompletionSource(); - var x = await _waiting.Task; + var x = await tcs.Task; if (x == null) break; if (x is T concreteX) yield return concreteX; else throw new InvalidOperationException("Received null from queue when a non-null value was expected."); } + else + { + // Queue ended and no more items + break; + } } } } @@ -70,7 +120,9 @@ public StreamingTtsClient(string apiKey) { _apiKey = apiKey; _webSocket = new ClientWebSocket(); - _websocketUri = new Uri($"wss://api.hume.ai/v0/tts/stream/input?api_key={apiKey}&no_binary=true&instant_mode=true&format_type=pcm"); + // For bidirectional streaming, use PCM format with strip_headers=true + // to ensure continuous streaming without per-chunk headers + _websocketUri = new Uri($"wss://api.hume.ai/v0/tts/stream/input?api_key={apiKey}&no_binary=true&instant_mode=true&strip_headers=true&format_type=pcm"); } public async Task ConnectAsync() @@ -90,14 +142,17 @@ public async Task ConnectAsync() { var buffer = new byte[8192]; var messageBuffer = new MemoryStream(); + int messagesReceived = 0; try { + Console.WriteLine("WebSocket receive loop started"); while (_webSocket.State == WebSocketState.Open) { var result = await _webSocket.ReceiveAsync(new ArraySegment(buffer), _cts.Token); if (result.MessageType == WebSocketMessageType.Close) { + Console.WriteLine("WebSocket close message received"); await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Server closed", _cts.Token); _queue.End(); break; @@ -109,11 +164,14 @@ public async Task ConnectAsync() // Check if this is the end of the message if (result.EndOfMessage) { + messagesReceived++; var json = System.Text.Encoding.UTF8.GetString(messageBuffer.ToArray()); + Console.WriteLine($"WebSocket received message #{messagesReceived}: {json.Substring(0, Math.Min(60, json.Length))}..."); _queue.Push(json); messageBuffer.SetLength(0); // Reset the buffer for the next message } } + Console.WriteLine($"WebSocket receive loop exited normally. State: {_webSocket.State}, Messages received: {messagesReceived}"); } catch (WebSocketException ex) { @@ -122,6 +180,7 @@ public async Task ConnectAsync() } catch (OperationCanceledException) { + Console.WriteLine("WebSocket receive loop cancelled"); _queue.End(); } finally @@ -153,18 +212,29 @@ public async Task SendCloseAsync() public async IAsyncEnumerable ReceiveAudioChunksAsync() { + Console.WriteLine("Starting to receive audio chunks..."); + int messageCount = 0; + int audioChunkCount = 0; await foreach (var item in _queue.GetAsyncEnumerable()) { + messageCount++; using (JsonDocument doc = JsonDocument.Parse(item)) { if (doc.RootElement.TryGetProperty("audio", out JsonElement audioElement)) { // It's an audio chunk, deserialize and yield + audioChunkCount++; + Console.WriteLine($"Yielding audio chunk #{audioChunkCount}"); var chunk = JsonSerializer.Deserialize(item)!; yield return chunk; } + else + { + Console.WriteLine($"Message #{messageCount} is not an audio chunk (might be timestamp or other message type)"); + } } } + Console.WriteLine($"Finished receiving. Total messages: {messageCount}, Audio chunks: {audioChunkCount}"); } public void Dispose() From ac537e362ef2c7d6340aa5e9c6e22755ee315d1a Mon Sep 17 00:00:00 2001 From: ivaaan Date: Fri, 10 Oct 2025 20:11:29 -0700 Subject: [PATCH 8/9] cleanup & comments --- tts/tts-dotnet-quickstart/Program.cs | 102 ++++++++++++------ .../StreamingTtsService.cs | 79 +++++++++++--- 2 files changed, 130 insertions(+), 51 deletions(-) diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index 2cada88e..3d677137 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -14,6 +14,11 @@ namespace TtsCsharpQuickstart; class Program { + // Constants + private const string ApiKeyEnvironmentVariable = "HUME_API_KEY"; + private const string DefaultVoiceName = "Ava Song"; + private const int VoiceCreationDelaySeconds = 8; + private static string? _apiKey; private static HumeClient? _client; private static string? _outputDir; @@ -25,10 +30,10 @@ static async Task RunExamplesAsync() Console.WriteLine("Starting..."); - _apiKey = Environment.GetEnvironmentVariable("HUME_API_KEY"); + _apiKey = Environment.GetEnvironmentVariable(ApiKeyEnvironmentVariable); if (string.IsNullOrEmpty(_apiKey)) { - throw new InvalidOperationException("HUME_API_KEY not found in environment variables."); + throw new InvalidOperationException($"{ApiKeyEnvironmentVariable} not found in environment variables."); } _client = new HumeClient(_apiKey); @@ -51,26 +56,27 @@ static async Task Main(string[] args) await RunExamplesAsync(); } - /** Example 1: Using a pre-existing voice. - * - * Use this method if you want to synthesize speech with a high-quality voice from - * Hume's Voice Library, or specify `provider: 'CUSTOM_VOICE'` to use a voice that - * you created previously via the Hume Platform or the API. - * */ + /// + /// Example 1: Using a pre-existing voice. + /// + /// Use this method if you want to synthesize speech with a high-quality voice from + /// Hume's Voice Library, or specify `provider: 'CUSTOM_VOICE'` to use a voice that + /// you created previously via the Hume Platform or the API. + /// static async Task Example1Async() { Console.WriteLine("Example 1: Synthesizing audio using a pre-existing voice..."); var voice = new PostedUtteranceVoiceWithName { - Name = "Ava Song", + Name = DefaultVoiceName, Provider = new VoiceProvider(Hume.Tts.VoiceProvider.Values.HumeAi) }; using var streamingPlayer = StartAudioPlayer(); await streamingPlayer.StartStreamingAsync(); - await foreach (var snippet in _client!.Tts.SynthesizeJsonStreamingAsync(new PostedTts + var ttsRequest = new PostedTts { Utterances = new List { @@ -80,16 +86,9 @@ static async Task Example1Async() // headers in container formats (wav, mp3). This allows you to start a // single audio player and stream all audio chunks to it without artifacts. StripHeaders = true, - })) - { - // Handle both TtsOutput and OneOf types for SDK compatibility - var snippetValue = snippet.Value; - if (snippetValue is SnippetAudioChunk audio) - { - await streamingPlayer.SendAudioAsync(Convert.FromBase64String(audio.Audio)); - } - } + }; + await StreamAudioToPlayerAsync(_client!.Tts.SynthesizeJsonStreamingAsync(ttsRequest), streamingPlayer); await streamingPlayer.StopStreamingAsync(); Console.WriteLine("Done!"); } @@ -165,7 +164,7 @@ static async Task Example2Async() using var streamingPlayer2 = StartAudioPlayer(); await streamingPlayer2.StartStreamingAsync(); - await foreach (var snippet in _client!.Tts.SynthesizeJsonStreamingAsync(new PostedTts + var continuationRequest = new PostedTts { Utterances = new List { @@ -181,19 +180,20 @@ static async Task Example2Async() GenerationId = selectedGenerationId }, StripHeaders = true, - })) - { - // Handle both TtsOutput and OneOf types for SDK compatibility - var snippetValue = snippet.Value; - if (snippetValue is SnippetAudioChunk audio) - { - await streamingPlayer2.SendAudioAsync(Convert.FromBase64String(audio.Audio)); - } - } + }; + + await StreamAudioToPlayerAsync(_client!.Tts.SynthesizeJsonStreamingAsync(continuationRequest), streamingPlayer2); await streamingPlayer2.StopStreamingAsync(); Console.WriteLine("Done!"); } + /// + /// Example 3: Bidirectional streaming. + /// + /// Demonstrates how to use WebSocket-based streaming for real-time text-to-speech. + /// This allows you to send text incrementally and receive audio chunks as they're generated, + /// enabling low-latency conversational experiences. + /// static async Task Example3Async() { Console.WriteLine("Example 3: Bidirectional streaming..."); @@ -207,6 +207,7 @@ static async Task Example3Async() using var silenceFiller = new SilenceFiller(audioPlayer.StandardInput!); silenceFiller.Start(); + // Task 1: Send text input to the TTS service var sendInputTask = Task.Run(async () => { await streamingTtsClient.SendAsync(new { text = "Hello" }); @@ -214,20 +215,23 @@ static async Task Example3Async() // The whitespace ^ is important, otherwise the model would see // "Helloworld." and not "Hello world." await streamingTtsClient.SendFlushAsync(); - Console.WriteLine("Waiting 8 seconds..."); - await Task.Delay(8000); + + // Simulate a delay before continuing the conversation + await Task.Delay(TimeSpan.FromSeconds(VoiceCreationDelaySeconds)); + await streamingTtsClient.SendAsync(new { text = " Goodbye, world." }); await streamingTtsClient.SendFlushAsync(); await streamingTtsClient.SendCloseAsync(); }); + // Task 2: Receive and play audio chunks as they arrive var handleMessagesTask = Task.Run(async () => { Console.WriteLine("Playing audio: Example 3 - Bidirectional streaming"); await foreach (var chunk in streamingTtsClient.ReceiveAudioChunksAsync()) { - var buf = Convert.FromBase64String(chunk.Audio); - silenceFiller.WriteAudio(buf); + var audioBytes = Convert.FromBase64String(chunk.Audio); + silenceFiller.WriteAudio(audioBytes); } await silenceFiller.EndStreamAsync(); await audioPlayer.StopStreamingAsync(); @@ -238,7 +242,30 @@ static async Task Example3Async() Console.WriteLine("Done!"); } - // Real-time streaming audio player using pipe-based approach + /// + /// Helper method to stream audio chunks from a TTS response to an audio player. + /// Handles SDK compatibility by working with both TtsOutput and OneOf types. + /// + private static async Task StreamAudioToPlayerAsync( + IAsyncEnumerable snippetStream, + StreamingAudioPlayer player) + { + await foreach (var snippet in snippetStream) + { + // Handle both TtsOutput and OneOf types for SDK compatibility + // Using dynamic type inspection for compatibility across SDK versions + var snippetValue = (snippet as dynamic)?.Value; + if (snippetValue is SnippetAudioChunk audio) + { + await player.SendAudioAsync(Convert.FromBase64String(audio.Audio)); + } + } + } + + /// + /// Real-time streaming audio player using ffplay. + /// Pipes audio data to ffplay process for immediate playback without writing to disk. + /// public class StreamingAudioPlayer : IDisposable { private Process? _audioProcess; @@ -246,6 +273,13 @@ public class StreamingAudioPlayer : IDisposable private bool _isStreaming = false; private readonly bool _usePcmFormat; + /// + /// Creates a new StreamingAudioPlayer. + /// + /// + /// If true, configures ffplay for raw PCM audio (48kHz, 16-bit signed little-endian). + /// If false, uses auto-detection for container formats like WAV or MP3 (default). + /// public StreamingAudioPlayer(bool usePcmFormat = false) { _usePcmFormat = usePcmFormat; diff --git a/tts/tts-dotnet-quickstart/StreamingTtsService.cs b/tts/tts-dotnet-quickstart/StreamingTtsService.cs index 9520ac3c..ca7004e1 100644 --- a/tts/tts-dotnet-quickstart/StreamingTtsService.cs +++ b/tts/tts-dotnet-quickstart/StreamingTtsService.cs @@ -13,7 +13,7 @@ namespace TtsCsharpQuickstart { /// - /// Thread-safe async queue implementation. + /// Thread-safe async queue implementation for WebSocket message handling. /// /// Race Condition Fix: This queue is accessed from multiple threads: /// 1. The WebSocket receive loop (background thread) calls Push() @@ -24,7 +24,7 @@ namespace TtsCsharpQuickstart /// _waiting, _ended) is accessed atomically. TaskCompletionSource is completed outside the lock /// to avoid potential deadlocks. /// - public class Queue + internal class Queue { private readonly object _lock = new object(); private readonly List _pushed = new List(); @@ -108,21 +108,36 @@ public async IAsyncEnumerable GetAsyncEnumerable() } } + /// + /// WebSocket client for bidirectional streaming TTS. + /// Handles connection management, message sending/receiving, and audio chunk streaming. + /// public class StreamingTtsClient : IDisposable { + private const int WebSocketBufferSize = 8192; + private const string WebSocketEndpoint = "wss://api.hume.ai/v0/tts/stream/input"; + private readonly ClientWebSocket _webSocket; private readonly string _apiKey; private readonly Uri _websocketUri; private readonly Queue _queue = new Queue(); + private readonly bool _enableDebugLogging; private CancellationTokenSource _cts = new CancellationTokenSource(); - public StreamingTtsClient(string apiKey) + /// + /// Creates a new StreamingTtsClient for bidirectional TTS streaming. + /// + /// Your Hume API key + /// Enable verbose logging for debugging (default: false) + public StreamingTtsClient(string apiKey, bool enableDebugLogging = false) { _apiKey = apiKey; + _enableDebugLogging = enableDebugLogging; _webSocket = new ClientWebSocket(); + // For bidirectional streaming, use PCM format with strip_headers=true // to ensure continuous streaming without per-chunk headers - _websocketUri = new Uri($"wss://api.hume.ai/v0/tts/stream/input?api_key={apiKey}&no_binary=true&instant_mode=true&strip_headers=true&format_type=pcm"); + _websocketUri = new Uri($"{WebSocketEndpoint}?api_key={apiKey}&no_binary=true&instant_mode=true&strip_headers=true&format_type=pcm"); } public async Task ConnectAsync() @@ -130,7 +145,7 @@ public async Task ConnectAsync() try { await _webSocket.ConnectAsync(_websocketUri, _cts.Token); - Console.WriteLine("WebSocket connected."); + LogDebug("WebSocket connected."); } catch (WebSocketException ex) { @@ -140,19 +155,19 @@ public async Task ConnectAsync() _ = Task.Run(async () => { - var buffer = new byte[8192]; + var buffer = new byte[WebSocketBufferSize]; var messageBuffer = new MemoryStream(); int messagesReceived = 0; try { - Console.WriteLine("WebSocket receive loop started"); + LogDebug("WebSocket receive loop started"); while (_webSocket.State == WebSocketState.Open) { var result = await _webSocket.ReceiveAsync(new ArraySegment(buffer), _cts.Token); if (result.MessageType == WebSocketMessageType.Close) { - Console.WriteLine("WebSocket close message received"); + LogDebug("WebSocket close message received"); await _webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "Server closed", _cts.Token); _queue.End(); break; @@ -165,22 +180,22 @@ public async Task ConnectAsync() if (result.EndOfMessage) { messagesReceived++; - var json = System.Text.Encoding.UTF8.GetString(messageBuffer.ToArray()); - Console.WriteLine($"WebSocket received message #{messagesReceived}: {json.Substring(0, Math.Min(60, json.Length))}..."); + var json = Encoding.UTF8.GetString(messageBuffer.ToArray()); + LogDebug($"Received message #{messagesReceived}"); _queue.Push(json); messageBuffer.SetLength(0); // Reset the buffer for the next message } } - Console.WriteLine($"WebSocket receive loop exited normally. State: {_webSocket.State}, Messages received: {messagesReceived}"); + LogDebug($"WebSocket receive loop exited. State: {_webSocket.State}, Messages: {messagesReceived}"); } catch (WebSocketException ex) { - Console.WriteLine($"WebSocket error in Receive loop: {ex.Message}"); + Console.WriteLine($"WebSocket error: {ex.Message}"); _queue.End(); } catch (OperationCanceledException) { - Console.WriteLine("WebSocket receive loop cancelled"); + LogDebug("WebSocket receive loop cancelled"); _queue.End(); } finally @@ -212,9 +227,10 @@ public async Task SendCloseAsync() public async IAsyncEnumerable ReceiveAudioChunksAsync() { - Console.WriteLine("Starting to receive audio chunks..."); + LogDebug("Starting to receive audio chunks..."); int messageCount = 0; int audioChunkCount = 0; + await foreach (var item in _queue.GetAsyncEnumerable()) { messageCount++; @@ -224,17 +240,27 @@ public async IAsyncEnumerable ReceiveAudioChunksAsync() { // It's an audio chunk, deserialize and yield audioChunkCount++; - Console.WriteLine($"Yielding audio chunk #{audioChunkCount}"); + LogDebug($"Yielding audio chunk #{audioChunkCount}"); var chunk = JsonSerializer.Deserialize(item)!; yield return chunk; } else { - Console.WriteLine($"Message #{messageCount} is not an audio chunk (might be timestamp or other message type)"); + // Non-audio messages (e.g., timestamps) are silently ignored + LogDebug($"Message #{messageCount} is not an audio chunk"); } } } - Console.WriteLine($"Finished receiving. Total messages: {messageCount}, Audio chunks: {audioChunkCount}"); + + LogDebug($"Finished receiving. Total messages: {messageCount}, Audio chunks: {audioChunkCount}"); + } + + private void LogDebug(string message) + { + if (_enableDebugLogging) + { + Console.WriteLine($"[StreamingTtsClient] {message}"); + } } public void Dispose() @@ -245,6 +271,12 @@ public void Dispose() } } + /// + /// Buffers and streams audio chunks to an output stream in a continuous manner. + /// Prevents audio playback gaps by maintaining a buffer between audio chunk arrival + /// and playback. Useful for real-time streaming scenarios where chunks may arrive + /// with slight delays. + /// public class SilenceFiller : IDisposable { private readonly BlockingCollection _audioBuffer = new BlockingCollection(); @@ -252,16 +284,26 @@ public class SilenceFiller : IDisposable private Task? _playbackTask; private readonly Stream _outputStream; + /// + /// Creates a new SilenceFiller that writes audio to the specified output stream. + /// + /// The stream to write audio data to (typically ffplay's stdin) public SilenceFiller(Stream outputStream) { _outputStream = outputStream; } + /// + /// Adds an audio chunk to the buffer for playback. + /// public void WriteAudio(byte[] audioBytes) { _audioBuffer.Add(audioBytes); } + /// + /// Starts the background task that continuously writes buffered audio to the output stream. + /// public void Start() { _playbackTask = Task.Run(async () => @@ -281,6 +323,9 @@ public void Start() }); } + /// + /// Signals that no more audio will be added and waits for all buffered audio to be written. + /// public async Task EndStreamAsync() { _audioBuffer.CompleteAdding(); From b95cf62c12edb7517034f3091198d14e1e23056b Mon Sep 17 00:00:00 2001 From: ivaaan Date: Fri, 10 Oct 2025 20:17:09 -0700 Subject: [PATCH 9/9] merge AudioPlayer and SilenceFiller --- tts/tts-dotnet-quickstart/Program.cs | 106 ++++++++++++++---- .../StreamingTtsService.cs | 69 ------------ 2 files changed, 86 insertions(+), 89 deletions(-) diff --git a/tts/tts-dotnet-quickstart/Program.cs b/tts/tts-dotnet-quickstart/Program.cs index 3d677137..76f40c87 100644 --- a/tts/tts-dotnet-quickstart/Program.cs +++ b/tts/tts-dotnet-quickstart/Program.cs @@ -1,21 +1,22 @@ using System; +using System.Collections.Concurrent; +using System.Collections.Generic; using System.Diagnostics; using System.IO; using System.Linq; -using System.Threading.Tasks; using System.Threading; +using System.Threading.Tasks; +using DotNetEnv; using Hume; using Hume.Tts; -using System.Collections.Generic; using TtsCsharpQuickstart; -using DotNetEnv; namespace TtsCsharpQuickstart; class Program { // Constants - private const string ApiKeyEnvironmentVariable = "HUME_API_KEY"; + private const string HumeApiKey = "HUME_API_KEY"; private const string DefaultVoiceName = "Ava Song"; private const int VoiceCreationDelaySeconds = 8; @@ -30,10 +31,10 @@ static async Task RunExamplesAsync() Console.WriteLine("Starting..."); - _apiKey = Environment.GetEnvironmentVariable(ApiKeyEnvironmentVariable); + _apiKey = Environment.GetEnvironmentVariable(HumeApiKey); if (string.IsNullOrEmpty(_apiKey)) { - throw new InvalidOperationException($"{ApiKeyEnvironmentVariable} not found in environment variables."); + throw new InvalidOperationException($"{HumeApiKey} not found in environment variables."); } _client = new HumeClient(_apiKey); @@ -45,7 +46,7 @@ static async Task RunExamplesAsync() Console.WriteLine($"Results will be written to {_outputDir}"); await Example1Async(); - // await Example2Async(); + await Example2Async(); await Example3Async(); Console.WriteLine("Done"); @@ -201,11 +202,9 @@ static async Task Example3Async() using var streamingTtsClient = new StreamingTtsClient(_apiKey!); await streamingTtsClient.ConnectAsync(); - // For bidirectional streaming with PCM, use specific ffplay arguments - using var audioPlayer = new StreamingAudioPlayer(usePcmFormat: true); + // Use buffered mode for bidirectional streaming to handle irregular chunk arrival timing + using var audioPlayer = new StreamingAudioPlayer(usePcmFormat: true, useBuffering: true); await audioPlayer.StartStreamingAsync(); - using var silenceFiller = new SilenceFiller(audioPlayer.StandardInput!); - silenceFiller.Start(); // Task 1: Send text input to the TTS service var sendInputTask = Task.Run(async () => @@ -231,9 +230,8 @@ static async Task Example3Async() await foreach (var chunk in streamingTtsClient.ReceiveAudioChunksAsync()) { var audioBytes = Convert.FromBase64String(chunk.Audio); - silenceFiller.WriteAudio(audioBytes); + await audioPlayer.SendAudioAsync(audioBytes); } - await silenceFiller.EndStreamAsync(); await audioPlayer.StopStreamingAsync(); }); @@ -265,13 +263,19 @@ private static async Task StreamAudioToPlayerAsync( /// /// Real-time streaming audio player using ffplay. /// Pipes audio data to ffplay process for immediate playback without writing to disk. + /// Supports optional buffering for scenarios with variable chunk arrival timing. /// public class StreamingAudioPlayer : IDisposable { private Process? _audioProcess; - public Stream? StandardInput { get; private set; } private bool _isStreaming = false; private readonly bool _usePcmFormat; + private readonly bool _useBuffering; + + // Buffering support for bidirectional streaming scenarios + private BlockingCollection? _audioBuffer; + private CancellationTokenSource? _bufferCts; + private Task? _bufferTask; /// /// Creates a new StreamingAudioPlayer. @@ -280,27 +284,72 @@ public class StreamingAudioPlayer : IDisposable /// If true, configures ffplay for raw PCM audio (48kHz, 16-bit signed little-endian). /// If false, uses auto-detection for container formats like WAV or MP3 (default). /// - public StreamingAudioPlayer(bool usePcmFormat = false) + /// + /// If true, enables buffered mode where audio chunks are queued and played continuously + /// by a background task. This is useful for bidirectional streaming where chunks may + /// arrive with irregular timing. If false, audio is written directly to ffplay (default). + /// + public StreamingAudioPlayer(bool usePcmFormat = false, bool useBuffering = false) { _usePcmFormat = usePcmFormat; + _useBuffering = useBuffering; + + if (_useBuffering) + { + _audioBuffer = new BlockingCollection(); + _bufferCts = new CancellationTokenSource(); + } } public Task StartStreamingAsync() { _isStreaming = true; StartAudioProcess(); + + // Start buffer draining task if buffering is enabled + if (_useBuffering && _audioBuffer != null && _bufferCts != null && _audioProcess != null) + { + _bufferTask = Task.Run(async () => + { + try + { + foreach (var audioBytes in _audioBuffer.GetConsumingEnumerable(_bufferCts.Token)) + { + if (_audioProcess?.StandardInput?.BaseStream != null) + { + await _audioProcess.StandardInput.BaseStream.WriteAsync(audioBytes, _bufferCts.Token); + await _audioProcess.StandardInput.BaseStream.FlushAsync(_bufferCts.Token); + } + } + } + catch (OperationCanceledException) + { + // Expected when stopping + } + }); + } + Console.WriteLine("Streaming audio player started..."); return Task.CompletedTask; } public Task SendAudioAsync(byte[] audioBytes) { - if (!_isStreaming || _audioProcess?.HasExited != false) return Task.CompletedTask; + if (!_isStreaming) return Task.CompletedTask; try { - _audioProcess?.StandardInput.BaseStream.Write(audioBytes, 0, audioBytes.Length); - _audioProcess?.StandardInput.BaseStream.Flush(); + if (_useBuffering && _audioBuffer != null) + { + // Buffered mode: add to queue for background task to process + _audioBuffer.Add(audioBytes); + } + else if (_audioProcess?.HasExited == false) + { + // Direct mode: write immediately to ffplay + _audioProcess?.StandardInput.BaseStream.Write(audioBytes, 0, audioBytes.Length); + _audioProcess?.StandardInput.BaseStream.Flush(); + } } catch (Exception ex) { @@ -316,6 +365,17 @@ public async Task StopStreamingAsync() try { + // Complete buffered audio if using buffering + if (_useBuffering && _audioBuffer != null) + { + _audioBuffer.CompleteAdding(); + if (_bufferTask != null) + { + await _bufferTask; + } + } + + // Close ffplay process if (_audioProcess != null && !_audioProcess.HasExited) { _audioProcess.StandardInput.Close(); @@ -357,8 +417,6 @@ private void StartAudioProcess() throw new InvalidOperationException("Failed to start ffplay process"); } - StandardInput = _audioProcess.StandardInput.BaseStream; - _audioProcess.ErrorDataReceived += (sender, e) => { if (!string.IsNullOrEmpty(e.Data)) @@ -377,11 +435,19 @@ public void Dispose() { try { + // Cancel buffering operations + _bufferCts?.Cancel(); + + // Kill ffplay process if still running if (_audioProcess != null && !_audioProcess.HasExited) { _audioProcess.Kill(); } + + // Dispose resources _audioProcess?.Dispose(); + _audioBuffer?.Dispose(); + _bufferCts?.Dispose(); } catch { } } diff --git a/tts/tts-dotnet-quickstart/StreamingTtsService.cs b/tts/tts-dotnet-quickstart/StreamingTtsService.cs index ca7004e1..ea4b6e04 100644 --- a/tts/tts-dotnet-quickstart/StreamingTtsService.cs +++ b/tts/tts-dotnet-quickstart/StreamingTtsService.cs @@ -270,73 +270,4 @@ public void Dispose() _cts.Dispose(); } } - - /// - /// Buffers and streams audio chunks to an output stream in a continuous manner. - /// Prevents audio playback gaps by maintaining a buffer between audio chunk arrival - /// and playback. Useful for real-time streaming scenarios where chunks may arrive - /// with slight delays. - /// - public class SilenceFiller : IDisposable - { - private readonly BlockingCollection _audioBuffer = new BlockingCollection(); - private readonly CancellationTokenSource _cts = new CancellationTokenSource(); - private Task? _playbackTask; - private readonly Stream _outputStream; - - /// - /// Creates a new SilenceFiller that writes audio to the specified output stream. - /// - /// The stream to write audio data to (typically ffplay's stdin) - public SilenceFiller(Stream outputStream) - { - _outputStream = outputStream; - } - - /// - /// Adds an audio chunk to the buffer for playback. - /// - public void WriteAudio(byte[] audioBytes) - { - _audioBuffer.Add(audioBytes); - } - - /// - /// Starts the background task that continuously writes buffered audio to the output stream. - /// - public void Start() - { - _playbackTask = Task.Run(async () => - { - try - { - foreach (var audio in _audioBuffer.GetConsumingEnumerable(_cts.Token)) - { - await _outputStream.WriteAsync(audio, _cts.Token); - await _outputStream.FlushAsync(_cts.Token); - } - } - catch (OperationCanceledException) - { - // Expected when _cts.Cancel() is called - } - }); - } - - /// - /// Signals that no more audio will be added and waits for all buffered audio to be written. - /// - public async Task EndStreamAsync() - { - _audioBuffer.CompleteAdding(); - if (_playbackTask != null) await _playbackTask; - } - - public void Dispose() - { - _cts.Cancel(); - _audioBuffer.Dispose(); - _cts.Dispose(); - } - } }