Skip to content

Commit 4cefe2a

Browse files
RogerBarreto authored and SergeyMenshykh committed
.Net: Add Audio Input and Output support for OpenAI chat completions (microsoft#11279)
- Resolve microsoft#10493 --------- Co-authored-by: SergeyMenshykh <[email protected]>
1 parent 1625192 commit 4cefe2a

File tree

8 files changed

+756
-11
lines changed

8 files changed

+756
-11
lines changed

dotnet/Directory.Packages.props

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
<PackageVersion Include="Microsoft.Bcl.AsyncInterfaces" Version="8.0.0" />
5454
<PackageVersion Include="Microsoft.Bcl.Numerics" Version="8.0.0" />
5555
<PackageVersion Include="Microsoft.CodeAnalysis.Common" Version="4.13.0" />
56-
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.11.0" />
56+
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.13.0" />
5757
<PackageVersion Include="Microsoft.Bcl.TimeProvider" Version="8.0.1" />
5858
<PackageVersion Include="Microsoft.Identity.Client" Version="4.67.2" />
5959
<PackageVersion Include="Microsoft.ML.OnnxRuntime" Version="1.21.0" />
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Copyright (c) Microsoft. All rights reserved.

using System.Reflection;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using OpenAI.Chat;
using Resources;

namespace ChatCompletion;

/// <summary>
/// These examples demonstrate how to use audio input and output with OpenAI Chat Completion.
/// </summary>
/// <remarks>
/// Currently, audio input and output is only supported with the following models:
/// <list type="bullet">
/// <item>gpt-4o-audio-preview</item>
/// </list>
/// The sample demonstrates:
/// <list type="bullet">
/// <item>How to send audio input to the model</item>
/// <item>How to receive both text and audio output from the model</item>
/// <item>How to save and process the audio response</item>
/// </list>
/// </remarks>
public class OpenAI_ChatCompletionWithAudio(ITestOutputHelper output) : BaseTest(output)
{
    /// <summary>
    /// This example demonstrates how to use audio input and receive both text and audio output from the model.
    /// </summary>
    /// <remarks>
    /// This sample shows:
    /// <list type="bullet">
    /// <item>Loading audio data from a resource file</item>
    /// <item>Configuring the chat completion service with audio options</item>
    /// <item>Enabling both text and audio response modalities</item>
    /// <item>Extracting and saving the audio response to a file</item>
    /// <item>Accessing the transcript metadata from the audio response</item>
    /// </list>
    /// </remarks>
    [Fact]
    public async Task UsingChatCompletionWithLocalInputAudioAndOutputAudio()
    {
        Console.WriteLine($"======== Open AI - {nameof(UsingChatCompletionWithLocalInputAudioAndOutputAudio)} ========\n");

        var audioBytes = await EmbeddedResource.ReadAllAsync("test_audio.wav");

        var kernel = Kernel.CreateBuilder()
            .AddOpenAIChatCompletion("gpt-4o-audio-preview", TestConfiguration.OpenAI.ApiKey)
            .Build();

        var chatCompletionService = kernel.GetRequiredService<IChatCompletionService>();

        // Ask the model for both a text and an audio (MP3, "Shimmer" voice) reply.
        var settings = new OpenAIPromptExecutionSettings
        {
            Audio = new ChatAudioOptions(ChatOutputAudioVoice.Shimmer, ChatOutputAudioFormat.Mp3),
            Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio
        };

        var chatHistory = new ChatHistory("You are a friendly assistant.");

        chatHistory.AddUserMessage([new AudioContent(audioBytes, "audio/wav")]);

        var result = await chatCompletionService.GetChatMessageContentAsync(chatHistory, settings);

        // Extract the audio item from the response. OfType<T>().First() both filters
        // and casts, avoiding the `as` cast + null-forgiving operator; it throws with a
        // clear message if the model returned no audio content.
        var audioReply = result.Items.OfType<AudioContent>().First();

        var currentDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location)!;
        var audioFile = Path.Combine(currentDirectory, "audio_output.mp3");

        // WriteAllBytesAsync creates the file or overwrites an existing one, so no
        // Exists/Delete check is needed; awaiting keeps the async method non-blocking.
        await File.WriteAllBytesAsync(audioFile, audioReply.Data!.Value.ToArray());

        Console.WriteLine($"Generated audio: {new Uri(audioFile).AbsoluteUri}");
        Console.WriteLine($"Transcript: {audioReply.Metadata!["Transcript"]}");
    }

    /// <summary>
    /// This example demonstrates how to use audio input and receive only text output from the model.
    /// </summary>
    /// <remarks>
    /// This sample shows:
    /// <list type="bullet">
    /// <item>Loading audio data from a resource file</item>
    /// <item>Configuring the chat completion service with audio options</item>
    /// <item>Setting response modalities to Text only</item>
    /// <item>Processing the text response from the model</item>
    /// </list>
    /// </remarks>
    [Fact]
    public async Task UsingChatCompletionWithLocalInputAudioAndTextOutput()
    {
        Console.WriteLine($"======== Open AI - {nameof(UsingChatCompletionWithLocalInputAudioAndTextOutput)} ========\n");

        var audioBytes = await EmbeddedResource.ReadAllAsync("test_audio.wav");

        var kernel = Kernel.CreateBuilder()
            .AddOpenAIChatCompletion("gpt-4o-audio-preview", TestConfiguration.OpenAI.ApiKey)
            .Build();

        var chatCompletionService = kernel.GetRequiredService<IChatCompletionService>();

        // Only the Text modality is requested, so the response carries no audio items.
        var settings = new OpenAIPromptExecutionSettings
        {
            Audio = new ChatAudioOptions(ChatOutputAudioVoice.Shimmer, ChatOutputAudioFormat.Mp3),
            Modalities = ChatResponseModalities.Text
        };

        var chatHistory = new ChatHistory("You are a friendly assistant.");

        chatHistory.AddUserMessage([new AudioContent(audioBytes, "audio/wav")]);

        var result = await chatCompletionService.GetChatMessageContentAsync(chatHistory, settings);

        // Text-only response: printing the message content yields the model's text reply.
        Console.WriteLine($"Assistant > {result}");
    }
}

dotnet/samples/Concepts/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ dotnet test -l "console;verbosity=detailed" --filter "FullyQualifiedName=ChatCom
7878
- [OpenAI_ChatCompletion](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/ChatCompletion/OpenAI_ChatCompletion.cs)
7979
- [OpenAI_ChatCompletionStreaming](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/ChatCompletion/OpenAI_ChatCompletionStreaming.cs)
8080
- [OpenAI_ChatCompletionWebSearch](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/ChatCompletion/OpenAI_ChatCompletionWebSearch.cs)
81+
- [OpenAI_ChatCompletionWithAudio](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/ChatCompletion/OpenAI_ChatCompletionWithAudio.cs)
8182
- [OpenAI_ChatCompletionWithReasoning](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/ChatCompletion/OpenAI_ChatCompletionWithReasoning.cs)
8283
- [OpenAI_ChatCompletionWithVision](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/ChatCompletion/OpenAI_ChatCompletionWithVision.cs)
8384
- [OpenAI_CustomClient](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/ChatCompletion/OpenAI_CustomClient.cs)

0 commit comments

Comments
 (0)