
Commit 99d25a7

Refactor token usage and prompt generation
Enhanced token usage logging in AzureOpenAITextGenerator and OpenAITextGenerator with detailed information. Introduced new private fields `_deployment` and `_textModel` in respective classes and updated constructors accordingly. Modified `MemoryAnswer` to handle multiple `TokenUsage` instances. Expanded `TokenUsage` class with additional properties and renamed existing ones for clarity. In `SearchClient`, added `CreatePrompt` method and updated `GenerateAnswer` to use it, ensuring token usage is always populated.
1 parent c4d4cf2 commit 99d25a7

File tree

5 files changed: +100 / -24 lines
extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs

Lines changed: 26 additions & 4 deletions
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 
+using System;
 using System.Collections.Generic;
 using System.Diagnostics.CodeAnalysis;
 using System.Net.Http;
@@ -30,6 +31,8 @@ public sealed class AzureOpenAITextGenerator : ITextGenerator
     private readonly ITextTokenizer _textTokenizer;
     private readonly ILogger<AzureOpenAITextGenerator> _log;
 
+    private readonly string _deployment;
+
     /// <inheritdoc/>
     public int MaxTokenTotal { get; }
 
@@ -89,6 +92,7 @@ public AzureOpenAITextGenerator(
     {
         this._client = skClient;
         this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<AzureOpenAITextGenerator>();
+        this._deployment = config.Deployment;
         this.MaxTokenTotal = config.MaxTokenTotal;
 
         if (textTokenizer == null)
@@ -145,10 +149,28 @@ public IReadOnlyList<string> GetTokens(string text)
         IAsyncEnumerable<StreamingTextContent> result = this._client.GetStreamingTextContentsAsync(prompt, skOptions, cancellationToken: cancellationToken);
         await foreach (StreamingTextContent x in result)
         {
-            var tokenUsage = x.Metadata?["Usage"] is ChatTokenUsage { } usage
-                ? new TokenUsage { InputTokenCount = usage.InputTokenCount, OutputTokenCount = usage.OutputTokenCount, TotalTokenCount = usage.TotalTokenCount }
-                : null;
-
+            TokenUsage? tokenUsage = null;
+
+            if (x.Metadata?["Usage"] is ChatTokenUsage { } usage)
+            {
+                this._log.LogTrace("Usage report: input tokens {0}, output tokens {1}, output reasoning tokens {2}",
+                    usage?.InputTokenCount, usage?.OutputTokenCount, usage?.OutputTokenDetails.ReasoningTokenCount);
+
+                tokenUsage = new TokenUsage
+                {
+                    Timestamp = DateTime.UtcNow,
+                    ServiceType = "Azure OpenAI",
+                    ModelType = "TextGeneration",
+                    ModelName = this._deployment,
+                    ServiceTokensIn = usage!.InputTokenCount,
+                    ServiceTokensOut = usage.OutputTokenCount,
+                    ServiceReasoningTokens = usage.OutputTokenDetails?.ReasoningTokenCount
+                };
+            }
+
+            // NOTE: as stated at https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices,
+            // the Choice can also be empty for the last chunk if we set stream_options: { "include_usage": true } to get token counts,
+            // so we continue only if both x.Text and tokenUsage are null.
             if (x.Text is null && tokenUsage is null) { continue; }
 
             yield return (x.Text, tokenUsage);

extensions/OpenAI/OpenAI/OpenAITextGenerator.cs

Lines changed: 26 additions & 4 deletions
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 
+using System;
 using System.Collections.Generic;
 using System.Diagnostics.CodeAnalysis;
 using System.Net.Http;
@@ -30,6 +31,8 @@ public sealed class OpenAITextGenerator : ITextGenerator
     private readonly ITextTokenizer _textTokenizer;
     private readonly ILogger<OpenAITextGenerator> _log;
 
+    private readonly string _textModel;
+
     /// <inheritdoc/>
     public int MaxTokenTotal { get; }
 
@@ -88,6 +91,7 @@ public OpenAITextGenerator(
     {
         this._client = skClient;
         this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<OpenAITextGenerator>();
+        this._textModel = config.TextModel;
        this.MaxTokenTotal = config.TextModelMaxTokenTotal;
 
         if (textTokenizer == null)
@@ -144,10 +148,28 @@ public IReadOnlyList<string> GetTokens(string text)
         IAsyncEnumerable<StreamingTextContent> result = this._client.GetStreamingTextContentsAsync(prompt, skOptions, cancellationToken: cancellationToken);
         await foreach (StreamingTextContent x in result)
         {
-            var tokenUsage = x.Metadata?["Usage"] is ChatTokenUsage { } usage
-                ? new TokenUsage { InputTokenCount = usage.InputTokenCount, OutputTokenCount = usage.OutputTokenCount, TotalTokenCount = usage.TotalTokenCount }
-                : null;
-
+            TokenUsage? tokenUsage = null;
+
+            if (x.Metadata?["Usage"] is ChatTokenUsage { } usage)
+            {
+                this._log.LogTrace("Usage report: input tokens {0}, output tokens {1}, output reasoning tokens {2}",
+                    usage?.InputTokenCount, usage?.OutputTokenCount, usage?.OutputTokenDetails.ReasoningTokenCount);
+
+                tokenUsage = new TokenUsage
+                {
+                    Timestamp = DateTime.UtcNow,
+                    ServiceType = "OpenAI",
+                    ModelType = "TextGeneration",
+                    ModelName = this._textModel,
+                    ServiceTokensIn = usage!.InputTokenCount,
+                    ServiceTokensOut = usage.OutputTokenCount,
+                    ServiceReasoningTokens = usage.OutputTokenDetails?.ReasoningTokenCount
+                };
+            }
+
+            // NOTE: as stated at https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices,
+            // the Choice can also be empty for the last chunk if we set stream_options: { "include_usage": true } to get token counts,
+            // so we continue only if both x.Text and tokenUsage are null.
             if (x.Text is null && tokenUsage is null) { continue; }
 
             yield return (x.Text, tokenUsage);
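
Both generators now yield (Text, TokenUsage) tuples, and because of the include_usage behavior noted in the comment above, the final chunk can carry usage data with no text. Below is a minimal consumer sketch mirroring the accumulation pattern SearchClient uses later in this commit; the ITextGenerator / TextGenerationOptions names, the GenerateTextAsync signature, and the helper method are assumptions for illustration, not part of this commit.

using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Models;

// Hypothetical helper: drain the stream, keeping the full text and the last usage report seen.
static async Task<(string Text, TokenUsage? Usage)> ReadAllAsync(
    ITextGenerator generator, string prompt, TextGenerationOptions options, CancellationToken ct)
{
    var text = new StringBuilder();
    TokenUsage? tokenUsage = null;

    await foreach (var x in generator.GenerateTextAsync(prompt, options, ct))
    {
        // The last chunk may have no text but still report token usage.
        if (x.Text is not null) { text.Append(x.Text); }
        if (x.TokenUsage is not null) { tokenUsage = x.TokenUsage; }
    }

    return (text.ToString(), tokenUsage);
}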

service/Abstractions/Models/MemoryAnswer.cs

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ public class MemoryAnswer
     /// <remarks>Not all the models and text generators return token usage information.</remarks>
     [JsonPropertyName("tokenUsage")]
     [JsonPropertyOrder(11)]
-    public TokenUsage? TokenUsage { get; set; }
+    public IList<TokenUsage> TokenUsages { get; set; } = [];
 
     /// <summary>
     /// List of the relevant sources used to produce the answer.
service/Abstractions/Models/TokenUsage.cs

Lines changed: 27 additions & 9 deletions
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 
+using System;
 using System.Text.Json.Serialization;
 
 namespace Microsoft.KernelMemory.Models;
@@ -9,21 +10,38 @@ namespace Microsoft.KernelMemory.Models;
 /// </summary>
 public class TokenUsage
 {
+    public DateTime Timestamp { get; set; }
+
+    public string? ServiceType { get; set; }
+
+    public string? ModelType { get; set; }
+
+    public string? ModelName { get; set; }
+
     /// <summary>
-    /// The number of tokens in the request message input, spanning all message content items.
+    /// The number of tokens in the request message input, spanning all message content items, measured by the tokenizer.
     /// </summary>
-    [JsonPropertyOrder(0)]
-    public int InputTokenCount { get; set; }
+    [JsonPropertyName("tokenizer_tokens_in")]
+    public int TokeninzerTokensIn { get; set; }
 
     /// <summary>
-    /// The combined number of output tokens in the generated completion, as consumed by the model.
+    /// The combined number of output tokens in the generated completion, measured by the tokenizer.
     /// </summary>
-    [JsonPropertyOrder(1)]
-    public int OutputTokenCount { get; set; }
+    [JsonPropertyName("tokenizer_tokens_out")]
+    public int TokeninzerTokensOut { get; set; }
 
     /// <summary>
-    /// The total number of combined input (prompt) and output (completion) tokens used.
+    /// The number of tokens in the request message input, spanning all message content items, measured by the service.
     /// </summary>
-    [JsonPropertyOrder(2)]
-    public int TotalTokenCount { get; set; }
+    [JsonPropertyName("service_tokens_in")]
+    public int? ServiceTokensIn { get; set; }
+
+    /// <summary>
+    /// The combined number of output tokens in the generated completion, as consumed by the model.
+    /// </summary>
+    [JsonPropertyName("service_tokens_out")]
+    public int? ServiceTokensOut { get; set; }
+
+    [JsonPropertyName("service_reasoning_tokens")]
+    public int? ServiceReasoningTokens { get; set; }
 }
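
Given the JsonPropertyName attributes above, here is a quick sketch of how one of these entries serializes with default System.Text.Json options; the sample values are made up, and the service's own serializer settings may differ.

using System;
using System.Text.Json;
using Microsoft.KernelMemory.Models;

var usage = new TokenUsage
{
    Timestamp = DateTime.UtcNow,
    ServiceType = "Azure OpenAI",
    ModelType = "TextGeneration",
    ModelName = "my-deployment",   // made-up deployment name
    TokeninzerTokensIn = 1200,
    TokeninzerTokensOut = 250,
    ServiceTokensIn = 1187,
    ServiceTokensOut = 243
};

// Properties without [JsonPropertyName] keep their C# names under default options, e.g.:
// {"Timestamp":"2024-...","ServiceType":"Azure OpenAI","ModelType":"TextGeneration","ModelName":"my-deployment",
//  "tokenizer_tokens_in":1200,"tokenizer_tokens_out":250,"service_tokens_in":1187,
//  "service_tokens_out":243,"service_reasoning_tokens":null}
Console.WriteLine(JsonSerializer.Serialize(usage));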

service/Core/Search/SearchClient.cs

Lines changed: 20 additions & 6 deletions
@@ -337,14 +337,16 @@ public async Task<MemoryAnswer> AskAsync(
             return noAnswerFound;
         }
 
+        var prompt = this.CreatePrompt(question, facts.ToString(), context);
+
         var text = new StringBuilder();
         TokenUsage? tokenUsage = null;
 
         var charsGenerated = 0;
         var watch = new Stopwatch();
         watch.Restart();
 
-        await foreach (var x in this.GenerateAnswer(question, facts.ToString(), context, cancellationToken).ConfigureAwait(false))
+        await foreach (var x in this.GenerateAnswer(prompt, context, cancellationToken).ConfigureAwait(false))
         {
             if (x.Text is not null)
             {
@@ -363,7 +365,13 @@ public async Task<MemoryAnswer> AskAsync(
         watch.Stop();
 
         answer.Result = text.ToString();
-        answer.TokenUsage = tokenUsage;
+
+        // If the service does not provide Token usage information, we explicitly create it.
+        tokenUsage ??= new TokenUsage { Timestamp = DateTime.UtcNow, ModelType = "TextGeneration" };
+        tokenUsage.TokeninzerTokensIn = this._textGenerator.CountTokens(prompt);
+        tokenUsage.TokeninzerTokensOut = this._textGenerator.CountTokens(answer.Result);
+
+        answer.TokenUsages.Add(tokenUsage);
 
         this._log.LogSensitive("Answer: {0}", answer.Result);
         answer.NoResult = ValueIsEquivalentTo(answer.Result, this._config.EmptyAnswer);
@@ -392,12 +400,9 @@ public async Task<MemoryAnswer> AskAsync(
         return answer;
     }
 
-    private IAsyncEnumerable<(string? Text, TokenUsage? TokenUsage)> GenerateAnswer(string question, string facts, IContext? context, CancellationToken token)
+    private string CreatePrompt(string question, string facts, IContext? context)
     {
         string prompt = context.GetCustomRagPromptOrDefault(this._answerPrompt);
-        int maxTokens = context.GetCustomRagMaxTokensOrDefault(this._config.AnswerTokens);
-        double temperature = context.GetCustomRagTemperatureOrDefault(this._config.Temperature);
-        double nucleusSampling = context.GetCustomRagNucleusSamplingOrDefault(this._config.TopP);
 
         prompt = prompt.Replace("{{$facts}}", facts.Trim(), StringComparison.OrdinalIgnoreCase);
 
@@ -406,6 +411,15 @@ public async Task<MemoryAnswer> AskAsync(
         prompt = prompt.Replace("{{$input}}", question, StringComparison.OrdinalIgnoreCase);
         prompt = prompt.Replace("{{$notFound}}", this._config.EmptyAnswer, StringComparison.OrdinalIgnoreCase);
 
+        return prompt;
+    }
+
+    private IAsyncEnumerable<(string? Text, TokenUsage? TokenUsage)> GenerateAnswer(string prompt, IContext? context, CancellationToken token)
+    {
+        int maxTokens = context.GetCustomRagMaxTokensOrDefault(this._config.AnswerTokens);
+        double temperature = context.GetCustomRagTemperatureOrDefault(this._config.Temperature);
+        double nucleusSampling = context.GetCustomRagNucleusSamplingOrDefault(this._config.TopP);
+
         var options = new TextGenerationOptions
         {
             MaxTokens = maxTokens,
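
To make the new CreatePrompt/GenerateAnswer split concrete, here is a standalone sketch of the placeholder substitution CreatePrompt performs. The template string and sample values are made up; the real default RAG prompt is resolved via GetCustomRagPromptOrDefault from configuration or the request context.

using System;

// Hypothetical template for illustration only.
string prompt = "Facts:\n{{$facts}}\n\nQuestion: {{$input}}\nIf the facts are not enough, reply: {{$notFound}}";

prompt = prompt.Replace("{{$facts}}", "The sky is blue.", StringComparison.OrdinalIgnoreCase);
prompt = prompt.Replace("{{$input}}", "What color is the sky?", StringComparison.OrdinalIgnoreCase);
prompt = prompt.Replace("{{$notFound}}", "INFO NOT FOUND", StringComparison.OrdinalIgnoreCase);

Console.WriteLine(prompt);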
