Skip to content

Commit 694b95e

Browse files
Introduce evaluators for agentic workflows (#6514)
Introduces the following new evaluators as part of the Quality package: `ToolCallAccuracyEvaluator`, `TaskRelevanceEvaluator` and `IntentResolutionEvaluator`, all currently marked `[Experimental]`.

Also includes the following changes:
* Fixes a regex bug that was causing reasoning and chain-of-thought outputs present in the evaluation response to not be parsed correctly into the corresponding metrics.
* Adds support for displaying tool calls and tool results in the conversation displayed in the report. This fixes #6370.
* Adds support for displaying JSON content both in the conversation and in the context (along with a new settings toggle for controlling pretty-printing of the displayed JSON).
* Adds tests for the new evaluators.

Fixes #6350
Fixes #6370
1 parent 0204116 commit 694b95e

34 files changed

+2580
-287
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.Collections.Generic;
5+
using System.Linq;
6+
using System.Text.Json;
7+
using System.Text.Json.Nodes;
8+
using Microsoft.Shared.Diagnostics;
9+
10+
namespace Microsoft.Extensions.AI.Evaluation.Quality;
11+
12+
/// <summary>
/// Helpers for rendering <see cref="AITool"/> definitions as JSON text.
/// </summary>
internal static class AIToolExtensions
{
    /// <summary>
    /// Renders the <see cref="AIFunction"/> entries in <paramref name="toolDefinitions"/> as a JSON array.
    /// </summary>
    /// <remarks>
    /// Tools that are not <see cref="AIFunction"/> instances are skipped. Each rendered entry carries the
    /// function's <c>name</c>, <c>description</c> and <c>functionSchema</c>, plus a
    /// <c>functionReturnValueSchema</c> when the function exposes a return schema.
    /// </remarks>
    /// <param name="toolDefinitions">The tool definitions to render. Validated with <c>Throw.IfNull</c>.</param>
    /// <param name="options">Options forwarded to <see cref="JsonNode.ToJsonString(JsonSerializerOptions?)"/>.</param>
    /// <returns>The JSON text of the resulting array.</returns>
    internal static string RenderAsJson(
        this IEnumerable<AITool> toolDefinitions,
        JsonSerializerOptions? options = null)
    {
        _ = Throw.IfNull(toolDefinitions);

        var renderedTools = new JsonArray();

        foreach (AITool tool in toolDefinitions)
        {
            // Only function tools have a name / description / schema to render.
            if (tool is not AIFunction function)
            {
                continue;
            }

            // Declared as JsonNode so that the non-generic JsonArray.Add(JsonNode?) overload is selected below.
            JsonNode entry = new JsonObject
            {
                ["name"] = function.Name,
                ["description"] = function.Description,
                ["functionSchema"] = JsonNode.Parse(function.JsonSchema.GetRawText()),
            };

            if (function.ReturnJsonSchema is JsonElement returnSchema)
            {
                entry["functionReturnValueSchema"] = JsonNode.Parse(returnSchema.GetRawText());
            }

            renderedTools.Add(entry);
        }

        return renderedTools.ToJsonString(options);
    }
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.Collections.Generic;
5+
using System.Text.Json;
6+
using System.Text.Json.Nodes;
7+
using Microsoft.Shared.Diagnostics;
8+
9+
namespace Microsoft.Extensions.AI.Evaluation.Quality;
10+
11+
/// <summary>
/// Helpers for rendering <see cref="ChatMessage"/> sequences as JSON text.
/// </summary>
internal static class ChatMessageExtensions
{
    /// <summary>
    /// Serializes every message in <paramref name="messages"/> and renders the results as a JSON array.
    /// </summary>
    /// <remarks>
    /// Each message is serialized with the <see cref="ChatMessage"/> type info obtained from
    /// <c>AIJsonUtilities.DefaultOptions</c>; <paramref name="options"/> only affects how the final array
    /// is written out. Messages that serialize to a <see langword="null"/> node are omitted.
    /// </remarks>
    /// <param name="messages">The messages to render. Validated with <c>Throw.IfNull</c>.</param>
    /// <param name="options">Options forwarded to <see cref="JsonNode.ToJsonString(JsonSerializerOptions?)"/>.</param>
    /// <returns>The JSON text of the resulting array.</returns>
    internal static string RenderAsJson(this IEnumerable<ChatMessage> messages, JsonSerializerOptions? options = null)
    {
        _ = Throw.IfNull(messages);

        var renderedMessages = new JsonArray();

        foreach (ChatMessage message in messages)
        {
            JsonNode? serializedMessage = JsonSerializer.SerializeToNode(
                message,
                AIJsonUtilities.DefaultOptions.GetTypeInfo(typeof(ChatMessage)));

            if (serializedMessage is null)
            {
                continue;
            }

            renderedMessages.Add(serializedMessage);
        }

        return renderedMessages.ToJsonString(options);
    }
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System;
5+
using System.Linq;
6+
using System.Text.Json;
7+
using System.Text.Json.Nodes;
8+
using Microsoft.Shared.Diagnostics;
9+
10+
namespace Microsoft.Extensions.AI.Evaluation.Quality;
11+
12+
/// <summary>
/// Helpers for rendering the contents of a <see cref="ChatResponse"/> as JSON text.
/// </summary>
internal static class ChatResponseExtensions
{
    /// <summary>
    /// Renders all messages of <paramref name="modelResponse"/> as a JSON array.
    /// </summary>
    /// <param name="modelResponse">The response whose messages are rendered. Validated with <c>Throw.IfNull</c>.</param>
    /// <param name="options">Options passed through to the message rendering.</param>
    /// <returns>The JSON text produced by rendering <c>modelResponse.Messages</c>.</returns>
    internal static string RenderAsJson(this ChatResponse modelResponse, JsonSerializerOptions? options = null)
    {
        _ = Throw.IfNull(modelResponse);

        IEnumerable<ChatMessage> responseMessages = modelResponse.Messages;
        return responseMessages.RenderAsJson(options);
    }

    /// <summary>
    /// Renders only the tool calls and tool results found in <paramref name="modelResponse"/> as a JSON array.
    /// </summary>
    /// <remarks>
    /// Walks the contents of every message in order and keeps the <see cref="FunctionCallContent"/> and
    /// <see cref="FunctionResultContent"/> items; all other content is ignored. Each kept item is serialized
    /// with the matching type info from <c>AIJsonUtilities.DefaultOptions</c>, and items that serialize to a
    /// <see langword="null"/> node are omitted.
    /// </remarks>
    /// <param name="modelResponse">The response to scan. Validated with <c>Throw.IfNull</c>.</param>
    /// <param name="options">Options forwarded to <see cref="JsonNode.ToJsonString(JsonSerializerOptions?)"/>.</param>
    /// <returns>The JSON text of the resulting array.</returns>
    internal static string RenderToolCallsAndResultsAsJson(
        this ChatResponse modelResponse,
        JsonSerializerOptions? options = null)
    {
        _ = Throw.IfNull(modelResponse);

        var renderedItems = new JsonArray();

        foreach (ChatMessage message in modelResponse.Messages)
        {
            foreach (AIContent content in message.Contents)
            {
                // Pick the serialization contract for the content; null means "not a tool call or result".
                Type? serializedAs = content switch
                {
                    FunctionCallContent => typeof(FunctionCallContent),
                    FunctionResultContent => typeof(FunctionResultContent),
                    _ => null,
                };

                if (serializedAs is null)
                {
                    continue;
                }

                JsonNode? serializedItem = JsonSerializer.SerializeToNode(
                    content,
                    AIJsonUtilities.DefaultOptions.GetTypeInfo(serializedAs));

                if (serializedItem is not null)
                {
                    renderedItems.Add(serializedItem);
                }
            }
        }

        return renderedItems.ToJsonString(options);
    }
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ public async ValueTask<EvaluationResult> EvaluateAsync(
8787
{
8888
metric.AddDiagnostics(
8989
EvaluationDiagnostic.Error(
90-
$"A value of type '{nameof(CompletenessEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."));
90+
$"A value of type {nameof(CompletenessEvaluatorContext)} was not found in the {nameof(additionalContext)} collection."));
9191

9292
return result;
9393
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ public async ValueTask<EvaluationResult> EvaluateAsync(
8686
{
8787
metric.AddDiagnostics(
8888
EvaluationDiagnostic.Error(
89-
$"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."));
89+
$"A value of type {nameof(EquivalenceEvaluatorContext)} was not found in the {nameof(additionalContext)} collection."));
9090

9191
return result;
9292
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,25 @@ internal static EvaluationMetricInterpretation InterpretScore(this NumericMetric
3333
: new EvaluationMetricInterpretation(rating);
3434
}
3535

36+
/// <summary>
/// Interprets the value of a <see cref="BooleanMetric"/> against the value that counts as a pass.
/// </summary>
/// <remarks>
/// A value equal to <paramref name="passValue"/> is rated <see cref="EvaluationRating.Exceptional"/>.
/// The opposite value is rated <see cref="EvaluationRating.Unacceptable"/> and a missing value is rated
/// <see cref="EvaluationRating.Inconclusive"/>; both of those are marked as failed with a reason.
/// </remarks>
/// <param name="metric">The metric whose value is interpreted.</param>
/// <param name="passValue">The value that is treated as passing.</param>
/// <returns>The interpretation of the metric's current value.</returns>
internal static EvaluationMetricInterpretation InterpretScore(
    this BooleanMetric metric,
    bool passValue = true)
{
    bool? actualValue = metric.Value;

    // A value that matches the pass value is the only non-failing outcome.
    if (actualValue is bool matched && matched == passValue)
    {
        return new EvaluationMetricInterpretation(EvaluationRating.Exceptional);
    }

    // Missing value: inconclusive. Present but different from the pass value: unacceptable.
    EvaluationRating failingRating =
        actualValue is null ? EvaluationRating.Inconclusive : EvaluationRating.Unacceptable;

    return new EvaluationMetricInterpretation(
        failingRating,
        failed: true,
        reason: $"{metric.Name} is not {passValue}.");
}
54+
3655
internal static bool TryParseEvaluationResponseWithValue<T>(
3756
this EvaluationMetric<T> metric,
3857
ChatResponse evaluationResponse,
@@ -81,7 +100,7 @@ internal static bool TryParseEvaluationResponseWithTags<T>(
81100

82101
static bool TryParseTag(string text, string tagName, [NotNullWhen(true)] out string? tagValue)
83102
{
84-
const RegexOptions Options = RegexOptions.Multiline;
103+
const RegexOptions Options = RegexOptions.Singleline;
85104
Match match = Regex.Match(text, $@"<{tagName}>(?<value>.*?)</{tagName}>", Options);
86105

87106
if (!match.Success || match.Groups["value"] is not Group valueGroup || !valueGroup.Success)
@@ -131,6 +150,11 @@ private static bool TryParseValue<T>(this EvaluationMetric<T> metric, string val
131150
booleanMetric.Value = booleanValue;
132151
return true;
133152
}
153+
else if (int.TryParse(valueText, out int intValue) && (intValue is 0 or 1))
154+
{
155+
booleanMetric.Value = intValue is 1;
156+
return true;
157+
}
134158
else
135159
{
136160
metric.AddDiagnostics(

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ public async ValueTask<EvaluationResult> EvaluateAsync(
8585
{
8686
metric.AddDiagnostics(
8787
EvaluationDiagnostic.Error(
88-
$"A value of type '{nameof(GroundednessEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."));
88+
$"A value of type {nameof(GroundednessEvaluatorContext)} was not found in the {nameof(additionalContext)} collection."));
8989

9090
return result;
9191
}

0 commit comments

Comments
 (0)