Skip to content

Commit 6204b0b

Browse files
Fix keys for EvaluationResult.Metrics dictionary to reflect the correct metric names for Safety evaluators (#6361)
This was an unfortunate regression introduced during a recent refactoring. The metrics returned from the Azure AI Foundry Evaluation service have different names than the ones we use in the Safety library. Before returning the metrics to the caller, we translate the EvaluationMetric.Name of each metric returned by the service to the more display-friendly name used in the library. While the names on the returned metrics were correctly patched up, the EvaluationResult.Metrics dictionary still stored these metrics under the original names returned from the service. Unfortunately, this meant that EvaluationResult.Get would throw an exception when trying to fetch a metric by its library name (for example, ViolenceEvaluator.ViolenceMetricName). This commit fixes the keys in the dictionary as well, and also updates the tests to cover the case being fixed. Fixes #6360
1 parent 13d0313 commit 6204b0b

File tree

5 files changed

+156
-18
lines changed

5 files changed

+156
-18
lines changed

src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyEvaluator.cs

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -151,12 +151,13 @@ await TimingHelper.ExecuteWithTimingAsync(() =>
151151
string annotationResult = annotationResponse.Text;
152152
EvaluationResult result = ContentSafetyService.ParseAnnotationResult(annotationResult);
153153

154-
UpdateMetrics();
154+
EvaluationResult updatedResult = UpdateMetrics();
155+
return updatedResult;
155156

156-
return result;
157-
158-
void UpdateMetrics()
157+
EvaluationResult UpdateMetrics()
159158
{
159+
EvaluationResult updatedResult = new EvaluationResult();
160+
160161
foreach (EvaluationMetric metric in result.Metrics.Values)
161162
{
162163
string contentSafetyServiceMetricName = metric.Name;
@@ -185,7 +186,11 @@ void UpdateMetrics()
185186
// metric.LogJsonData(payload);
186187
// metric.LogJsonData(annotationResult);
187188
#pragma warning restore S125
189+
190+
updatedResult.Metrics.Add(metric.Name, metric);
188191
}
192+
193+
return updatedResult;
189194
}
190195
}
191196

src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentSafetyService.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ internal static EvaluationResult ParseAnnotationResult(string annotationResponse
120120
}
121121
}
122122

123-
result.Metrics[metric.Name] = metric;
123+
result.Metrics.Add(metric.Name, metric);
124124
}
125125

126126
return result;

src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ProtectedMaterialEvaluator.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ await EvaluateContentSafetyAsync(
100100

101101
foreach (EvaluationMetric imageMetric in imageResult.Metrics.Values)
102102
{
103-
result.Metrics[imageMetric.Name] = imageMetric;
103+
result.Metrics.Add(imageMetric.Name, imageMetric);
104104
}
105105
}
106106

test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests;
2020

21+
[Experimental("AIEVAL001")]
2122
public class QualityEvaluatorTests
2223
{
2324
private static readonly ChatOptions? _chatOptions;
@@ -47,9 +48,7 @@ static QualityEvaluatorTests()
4748
string temperature = $"Temperature: {_chatOptions.Temperature}";
4849
string usesContext = $"Feature: Context";
4950

50-
#pragma warning disable AIEVAL001
5151
IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator();
52-
#pragma warning restore AIEVAL001
5352

5453
IEvaluator coherenceEvaluator = new CoherenceEvaluator();
5554
IEvaluator fluencyEvaluator = new FluencyEvaluator();
@@ -101,6 +100,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync(
101100
Assert.False(
102101
result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
103102
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
103+
104+
Assert.Equal(6, result.Metrics.Count);
105+
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _));
106+
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _));
107+
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _));
108+
Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _));
109+
Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _));
110+
Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _));
104111
}
105112

106113
[ConditionalFact]
@@ -132,6 +139,14 @@ await _qualityReportingConfiguration.CreateScenarioRunAsync(
132139
Assert.False(
133140
result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
134141
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
142+
143+
Assert.Equal(6, result.Metrics.Count);
144+
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName, out NumericMetric? _));
145+
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.TruthMetricName, out NumericMetric? _));
146+
Assert.True(result.TryGet(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName, out NumericMetric? _));
147+
Assert.True(result.TryGet(CoherenceEvaluator.CoherenceMetricName, out NumericMetric? _));
148+
Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _));
149+
Assert.True(result.TryGet(RelevanceEvaluator.RelevanceMetricName, out NumericMetric? _));
135150
#if NET
136151
});
137152
#else
@@ -161,6 +176,17 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync(
161176
Assert.True(
162177
result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)),
163178
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
179+
180+
Assert.Equal(4, result.Metrics.Count);
181+
Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness));
182+
Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence));
183+
Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness));
184+
Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval));
185+
186+
Assert.Null(groundedness.Context);
187+
Assert.Null(equivalence.Context);
188+
Assert.Null(completeness.Context);
189+
Assert.Null(retrieval.Context);
164190
}
165191

166192
[ConditionalFact]
@@ -224,6 +250,32 @@ await scenarioRun.EvaluateAsync(
224250
groundingContextForGroundednessEvaluator,
225251
groundTruthForCompletenessEvaluator,
226252
retrievedContextChunksForRetrievalEvaluator]);
253+
254+
Assert.Equal(4, result.Metrics.Count);
255+
Assert.True(result.TryGet(GroundednessEvaluator.GroundednessMetricName, out NumericMetric? groundedness));
256+
Assert.True(result.TryGet(EquivalenceEvaluator.EquivalenceMetricName, out NumericMetric? equivalence));
257+
Assert.True(result.TryGet(CompletenessEvaluator.CompletenessMetricName, out NumericMetric? completeness));
258+
Assert.True(result.TryGet(RetrievalEvaluator.RetrievalMetricName, out NumericMetric? retrieval));
259+
260+
Assert.True(
261+
groundedness.Context?.Count is 1 &&
262+
groundedness.Context.TryGetValue(GroundednessEvaluatorContext.GroundingContextName, out EvaluationContext? context1) &&
263+
ReferenceEquals(context1, groundingContextForGroundednessEvaluator));
264+
265+
Assert.True(
266+
equivalence.Context?.Count is 1 &&
267+
equivalence.Context.TryGetValue(EquivalenceEvaluatorContext.GroundTruthContextName, out EvaluationContext? context2) &&
268+
ReferenceEquals(context2, baselineResponseForEquivalenceEvaluator));
269+
270+
Assert.True(
271+
completeness.Context?.Count is 1 &&
272+
completeness.Context.TryGetValue(CompletenessEvaluatorContext.GroundTruthContextName, out EvaluationContext? context3) &&
273+
ReferenceEquals(context3, groundTruthForCompletenessEvaluator));
274+
275+
Assert.True(
276+
retrieval.Context?.Count is 1 &&
277+
retrieval.Context.TryGetValue(RetrievalEvaluatorContext.RetrievedContextChunksContextName, out EvaluationContext? context4) &&
278+
ReferenceEquals(context4, retrievedContextChunksForRetrievalEvaluator));
227279
}
228280

229281
[MemberNotNull(nameof(_qualityReportingConfiguration))]

test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs

Lines changed: 91 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -153,17 +153,33 @@ At its furthest point (conjunction), Mars is about 250 million miles from Earth.
153153
The distance varies due to the elliptical orbits of both planets.
154154
""";
155155

156-
IEnumerable<EvaluationContext> additionalContext =
157-
[
158-
new GroundednessProEvaluatorContext(groundingContext),
159-
new UngroundedAttributesEvaluatorContext(groundingContext)
160-
];
156+
GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext);
157+
UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext);
158+
IEnumerable<EvaluationContext> additionalContext = [groundednessProContext, ungroundedAttributesContext];
161159

162160
EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response, additionalContext);
163161

164162
Assert.False(
165163
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
166164
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
165+
166+
Assert.Equal(6, result.Metrics.Count);
167+
Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _));
168+
Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _));
169+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
170+
Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro));
171+
Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes));
172+
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
173+
174+
Assert.True(
175+
groundednessPro.Context?.Count is 1 &&
176+
groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) &&
177+
ReferenceEquals(context1, groundednessProContext));
178+
179+
Assert.True(
180+
ungroundedAttributes.Context?.Count is 1 &&
181+
ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) &&
182+
ReferenceEquals(context2, ungroundedAttributesContext));
167183
}
168184

169185
[ConditionalFact]
@@ -212,17 +228,33 @@ At its closest (opposition), Jupiter is about 365 million miles away.
212228
At its furthest (conjunction), it can be approximately 601 million miles away.
213229
""";
214230

215-
IEnumerable<EvaluationContext> additionalContext =
216-
[
217-
new GroundednessProEvaluatorContext(groundingContext),
218-
new UngroundedAttributesEvaluatorContext(groundingContext)
219-
];
231+
GroundednessProEvaluatorContext groundednessProContext = new GroundednessProEvaluatorContext(groundingContext);
232+
UngroundedAttributesEvaluatorContext ungroundedAttributesContext = new UngroundedAttributesEvaluatorContext(groundingContext);
233+
IEnumerable<EvaluationContext> additionalContext = [groundednessProContext, ungroundedAttributesContext];
220234

221235
EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response2, additionalContext);
222236

223237
Assert.False(
224238
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
225239
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
240+
241+
Assert.Equal(6, result.Metrics.Count);
242+
Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _));
243+
Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _));
244+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
245+
Assert.True(result.TryGet(GroundednessProEvaluator.GroundednessProMetricName, out NumericMetric? groundednessPro));
246+
Assert.True(result.TryGet(UngroundedAttributesEvaluator.UngroundedAttributesMetricName, out BooleanMetric? ungroundedAttributes));
247+
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
248+
249+
Assert.True(
250+
groundednessPro.Context?.Count is 1 &&
251+
groundednessPro.Context.TryGetValue(GroundednessProEvaluatorContext.GroundingContextName, out EvaluationContext? context1) &&
252+
ReferenceEquals(context1, groundednessProContext));
253+
254+
Assert.True(
255+
ungroundedAttributes.Context?.Count is 1 &&
256+
ungroundedAttributes.Context.TryGetValue(UngroundedAttributesEvaluatorContext.GroundingContextName, out EvaluationContext? context2) &&
257+
ReferenceEquals(context2, ungroundedAttributesContext));
226258
}
227259

228260
[ConditionalFact]
@@ -250,6 +282,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync(
250282
Assert.False(
251283
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
252284
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
285+
286+
Assert.Equal(7, result.Metrics.Count);
287+
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
288+
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
289+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
290+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
291+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
292+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
293+
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
253294
}
254295

255296
[ConditionalFact]
@@ -277,6 +318,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync(
277318
Assert.False(
278319
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
279320
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
321+
322+
Assert.Equal(7, result.Metrics.Count);
323+
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
324+
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
325+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
326+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
327+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
328+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
329+
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
280330
}
281331

282332
[ConditionalFact]
@@ -317,6 +367,15 @@ await _imageContentSafetyReportingConfiguration.CreateScenarioRunAsync(
317367
Assert.False(
318368
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
319369
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
370+
371+
Assert.Equal(7, result.Metrics.Count);
372+
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
373+
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
374+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
375+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
376+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
377+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
378+
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
320379
}
321380

322381
[ConditionalFact]
@@ -370,6 +429,15 @@ These distances are approximate and can vary slightly depending on the specific
370429
Assert.False(
371430
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
372431
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
432+
433+
Assert.Equal(7, result.Metrics.Count);
434+
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
435+
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
436+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedMaterialMetricName, out BooleanMetric? _));
437+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedArtworkMetricName, out BooleanMetric? _));
438+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedFictionalCharactersMetricName, out BooleanMetric? _));
439+
Assert.True(result.TryGet(ProtectedMaterialEvaluator.ProtectedLogosAndBrandsMetricName, out BooleanMetric? _));
440+
Assert.True(result.TryGet(IndirectAttackEvaluator.IndirectAttackMetricName, out BooleanMetric? _));
373441
}
374442

375443
[ConditionalFact]
@@ -396,6 +464,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync(
396464
Assert.False(
397465
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
398466
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
467+
468+
Assert.Single(result.Metrics);
469+
Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _));
399470
}
400471

401472
[ConditionalFact]
@@ -434,6 +505,9 @@ await _codeVulnerabilityReportingConfiguration.CreateScenarioRunAsync(
434505
Assert.False(
435506
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
436507
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
508+
509+
Assert.Single(result.Metrics);
510+
Assert.True(result.TryGet(CodeVulnerabilityEvaluator.CodeVulnerabilityMetricName, out BooleanMetric? _));
437511
}
438512

439513
[ConditionalFact]
@@ -465,6 +539,13 @@ await _mixedQualityAndSafetyReportingConfiguration.CreateScenarioRunAsync(
465539
Assert.False(
466540
result.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error),
467541
string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));
542+
543+
Assert.Equal(5, result.Metrics.Count);
544+
Assert.True(result.TryGet(FluencyEvaluator.FluencyMetricName, out NumericMetric? _));
545+
Assert.True(result.TryGet(HateAndUnfairnessEvaluator.HateAndUnfairnessMetricName, out NumericMetric? _));
546+
Assert.True(result.TryGet(SelfHarmEvaluator.SelfHarmMetricName, out NumericMetric? _));
547+
Assert.True(result.TryGet(SexualEvaluator.SexualMetricName, out NumericMetric? _));
548+
Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _));
468549
}
469550

470551
[MemberNotNull(nameof(_contentSafetyReportingConfiguration))]

0 commit comments

Comments
 (0)