Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness.
* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators, built atop the Azure AI Foundry Evaluation service, that can be used to evaluate the content safety of AI responses in your projects, including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack.
* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural
language processing tasks. Evaluators currently include BLEU score, with more planned.
language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores.
* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data.
* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container.
* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
Expand Down Expand Up @@ -77,18 +78,18 @@ public ValueTask<EvaluationResult> EvaluateAsync(
return new ValueTask<EvaluationResult>(result);
}

var (score, duration) = TimingHelper.ExecuteWithTiming(() =>
(double score, TimeSpan duration) = TimingHelper.ExecuteWithTiming(() =>
{
var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference));
var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text);
string[][] references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference).ToArray()).ToArray();
string[] hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text).ToArray();
return BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4);
});

metric.Value = score;
string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText);
metric.AddOrUpdateContext(context);
metric.Interpretation = NLPScoreInterpretation.Interpret(metric);
metric.Interpretation = metric.Interpret();

return new ValueTask<EvaluationResult>(result);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ public sealed class BLEUEvaluatorContext : EvaluationContext
/// Gets the unique <see cref="EvaluationContext.Name"/> that is used for
/// <see cref="BLEUEvaluatorContext"/>.
/// </summary>
public static string BLEUContextName => "BLEU Context";
public static string ReferencesContextName => "References (BLEU)";

/// <summary>
/// Gets the reference responses against which the provided model response will be scored.
/// Gets the references against which the provided response will be scored.
/// </summary>
/// <remarks>
/// The <see cref="BLEUEvaluator"/> measures the degree to which the response being evaluated is similar to
Expand All @@ -41,8 +41,8 @@ public sealed class BLEUEvaluatorContext : EvaluationContext
/// <param name="references">
/// The reference responses against which the response that is being evaluated is compared.
/// </param>
public BLEUEvaluatorContext(params string[] references)
: this(references as IEnumerable<string>)
public BLEUEvaluatorContext(IEnumerable<string> references)
: this(references.ToArray())
{
}

Expand All @@ -52,11 +52,11 @@ public BLEUEvaluatorContext(params string[] references)
/// <param name="references">
/// The reference responses against which the response that is being evaluated is compared.
/// </param>
public BLEUEvaluatorContext(IEnumerable<string> references)
public BLEUEvaluatorContext(params string[] references)
: base(
name: BLEUContextName,
name: ReferencesContextName,
contents: [.. references.Select(c => new TextContent(c))])
{
References = [.. references];
References = references;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;
/// </summary>
internal static class BLEUAlgorithm
{
internal static int ClosestRefLength(IEnumerable<IEnumerable<string>> references, int hypLength)
internal static int ClosestRefLength(string[][] references, int hypLength)
{
if (!references.Any())
{
Expand All @@ -27,7 +27,7 @@ internal static int ClosestRefLength(IEnumerable<IEnumerable<string>> references
int smallestDiff = int.MaxValue;
foreach (var reference in references)
{
int refLength = reference.Count();
int refLength = reference.Length;
int diff = Math.Abs(refLength - hypLength);
if (diff < smallestDiff ||
(diff == smallestDiff && refLength < closestRefLength))
Expand Down Expand Up @@ -55,27 +55,27 @@ internal static double BrevityPenalty(int closestRefLength, int hypLength)
return Math.Exp(1 - ((double)closestRefLength / hypLength));
}

internal static RationalNumber ModifiedPrecision(IEnumerable<IEnumerable<string>> references, IEnumerable<string> hypothesis, int n = 1)
internal static RationalNumber ModifiedPrecision(string[][] references, string[] hypothesis, int n = 1)
{
if (n <= 0)
{
Throw.ArgumentOutOfRangeException(nameof(n), $"`{nameof(n)}` must be greater than zero.");
}

if (!references.Any() || !hypothesis.Any())
if (references.Length == 0 || hypothesis.Length == 0)
{
return RationalNumber.Zero;
}

var hyp = hypothesis.CreateNGrams(n);
var hypCounts = new MatchCounter<NGram<string>>(hyp);
List<NGram<string>> hypGrams = hypothesis.CreateNGrams(n);
MatchCounter<NGram<string>> hypCounts = new(hypGrams);

Dictionary<NGram<string>, int> maxCounts = [];

foreach (var rf in references)
{
IEnumerable<NGram<string>> refGrams = rf.CreateNGrams(n);
var refCounts = new MatchCounter<NGram<string>>(refGrams);
List<NGram<string>> refGrams = rf.CreateNGrams(n);
MatchCounter<NGram<string>> refCounts = new(refGrams);

foreach (var ct in refCounts)
{
Expand Down Expand Up @@ -123,25 +123,28 @@ internal static double[] EqualWeights(int n)
}

double[] weights = new double[n];
#if NET8_0_OR_GREATER
Array.Fill(weights, 1.0 / n);
#else
for (int i = 0; i < n; i++)
{
weights[i] = 1.0 / n;
}

#endif
return weights;
}

internal static readonly double[] DefaultBLEUWeights = EqualWeights(4);

internal static double SentenceBLEU(IEnumerable<IEnumerable<string>> references, IEnumerable<string> hypothesis,
internal static double SentenceBLEU(string[][] references, string[] hypothesis,
double[]? weights = null, Func<RationalNumber[], int, double[]>? smoothingFunction = null)
{
if (references == null || !references.Any())
if (references == null || references.Length == 0)
{
Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty.");
}

if (hypothesis == null || !hypothesis.Any())
if (hypothesis == null || hypothesis.Length == 0)
{
Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty.");
}
Expand Down Expand Up @@ -171,7 +174,7 @@ internal static double SentenceBLEU(IEnumerable<IEnumerable<string>> references,
precisionValues[i] = prec;
}

int hypLen = hypothesis.Count();
int hypLen = hypothesis.Length;
int closestRefLength = ClosestRefLength(references, hypLen);
double brevityPenalty = BrevityPenalty(closestRefLength, hypLen);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;

/// <summary>
/// Computes the F1 score of a generated response against a reference (ground truth) response:
/// the harmonic mean of token-level precision and recall over the words the two share.
/// Python implementation reference
/// https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py.
/// </summary>
internal static class F1Algorithm
{
    /// <summary>
    /// Calculates the F1 score from pre-tokenized word sequences.
    /// </summary>
    /// <param name="groundTruth">Tokens of the reference answer. Cannot be null or empty.</param>
    /// <param name="response">Tokens of the generated answer. Cannot be null or empty.</param>
    /// <returns>A score in the range [0, 1]; 0 when the sequences share no tokens.</returns>
    public static double CalculateF1Score(string[] groundTruth, string[] response)
    {
        if (groundTruth == null || groundTruth.Length == 0)
        {
            Throw.ArgumentNullException(nameof(groundTruth), $"'{nameof(groundTruth)}' cannot be null or empty.");
        }

        if (response == null || response.Length == 0)
        {
            Throw.ArgumentNullException(nameof(response), $"'{nameof(response)}' cannot be null or empty.");
        }

        // Count occurrences of each token on both sides, then take the multiset
        // intersection so a repeated token is only credited as many times as it
        // appears in both sequences.
        MatchCounter<string> truthCounts = new(groundTruth);
        MatchCounter<string> responseCounts = new(response);
        int sharedTokenCount = truthCounts.Intersect(responseCounts).Sum();

        if (sharedTokenCount == 0)
        {
            // No overlap at all, so precision and recall are both zero.
            return 0.0;
        }

        double precision = (double)sharedTokenCount / response.Length;
        double recall = (double)sharedTokenCount / groundTruth.Length;
        return (2.0 * precision * recall) / (precision + recall);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;

/// <summary>
/// Google-BLEU (GLEU) algorithm implementation for evaluating the quality of a response.
/// Python implementation reference: https://www.nltk.org/api/nltk.translate.gleu_score.html.
/// </summary>
internal static class GLEUAlgorithm
{
    /// <summary>
    /// Computes the sentence-level GLEU score of <paramref name="hypothesis"/> against one or more
    /// <paramref name="references"/>, counting all n-grams between <paramref name="minN"/> and
    /// <paramref name="maxN"/> tokens long.
    /// </summary>
    /// <param name="references">Tokenized reference sentences. Cannot be null or empty.</param>
    /// <param name="hypothesis">Tokenized hypothesis (the response being scored). Cannot be null or empty.</param>
    /// <param name="minN">Minimum n-gram length to count.</param>
    /// <param name="maxN">Maximum n-gram length to count.</param>
    /// <returns>The GLEU score, in the range [0, 1].</returns>
    internal static double SentenceGLEU(string[][] references, string[] hypothesis, int minN = 1, int maxN = 4)
    {
        if (references == null || references.Length == 0)
        {
            Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty.");
        }

        if (hypothesis == null || hypothesis.Length == 0)
        {
            Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty.");
        }

        MatchCounter<NGram<string>> hypNGrams = new(hypothesis.CreateAllNGrams(minN, maxN));

        // Total hypothesis n-grams: true positives + false positives.
        int truePosFalsePos = hypNGrams.Sum();

        // Per the NLTK reference implementation, when multiple references are supplied the
        // score is taken against the single reference that yields the highest match ratio
        // (max over truePos / nAll), NOT by pooling counts across all references.
        int bestTruePos = 0;
        int bestNAll = 0;
        double bestRatio = -1.0;

        foreach (var reference in references)
        {
            MatchCounter<NGram<string>> refNGrams = new(reference.CreateAllNGrams(minN, maxN));

            // Total reference n-grams: true positives + false negatives.
            int truePosFalseNeg = refNGrams.Sum();

            // N-grams common to hypothesis and reference (multiset intersection): true positives.
            MatchCounter<NGram<string>> overlapNGrams = hypNGrams.Intersect(refNGrams);
            int truePos = overlapNGrams.Sum();

            // GLEU is min(precision, recall), which equals truePos / max(tpfp, tpfn).
            int nAll = Math.Max(truePosFalsePos, truePosFalseNeg);

            if (nAll > 0)
            {
                double ratio = (double)truePos / nAll;
                if (ratio > bestRatio)
                {
                    bestRatio = ratio;
                    bestTruePos = truePos;
                    bestNAll = nAll;
                }
            }
        }

        if (bestNAll == 0)
        {
            // No n-grams to compare in the requested length range.
            return 0.0;
        }

        return (double)bestTruePos / bestNAll;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,26 @@ public void AddRange(IEnumerable<T> items)
}
}

public string ToDebugString() => string.Concat(_counts.Select(v => $"{v.Key}: {v.Value}, "));
/// <summary>
/// Returns the multiset intersection of this counter and <paramref name="other"/>:
/// every key present in both counters, mapped to the smaller of its two counts.
/// </summary>
public MatchCounter<T> Intersect(MatchCounter<T> other)
{
    _ = Throw.IfNull(other, nameof(other));

    var result = new MatchCounter<T>();

    // Enumerate the smaller dictionary and probe the larger one, so the work
    // done is proportional to the smaller of the two counters.
    Dictionary<T, int> scan;
    Dictionary<T, int> probe;
    if (_counts.Count < other._counts.Count)
    {
        scan = _counts;
        probe = other._counts;
    }
    else
    {
        scan = other._counts;
        probe = _counts;
    }

    foreach (KeyValuePair<T, int> entry in scan)
    {
        if (probe.TryGetValue(entry.Key, out int matchingCount))
        {
            result._counts[entry.Key] = Math.Min(entry.Value, matchingCount);
        }
    }

    return result;
}

/// <summary>
/// Renders the counter as a comma-separated "key: count" list for debugging and logging.
/// </summary>
public string ToDebugString() => string.Join(",", _counts.Select(v => $"{v.Key}: {v.Value}"));

public IEnumerator<KeyValuePair<T, int>> GetEnumerator() => _counts.GetEnumerator();

Expand Down
Loading
Loading