
Commit 076f99f

🚀 feat: Agent Cache Tokens & Anthropic Reasoning Support (#6098)
* fix: handling of top_k and top_p parameters for Claude-3.7 models (allowed without reasoning)
* feat: bump @librechat/agents for Anthropic Reasoning support
* fix: update reasoning handling for OpenRouter integration
* fix: enhance agent token spending logic to include cache creation and read details
* fix: update logic for thinking status in ContentParts component
* refactor: improve agent title handling
* chore: bump @librechat/agents to version 2.1.7 for parallel tool calling for Google models
1 parent 5cba139 commit 076f99f


11 files changed: +187 -40 lines changed


api/app/clients/AnthropicClient.js

Lines changed: 8 additions & 9 deletions
@@ -746,15 +746,6 @@ class AnthropicClient extends BaseClient {
       metadata,
     };

-    if (!/claude-3[-.]7/.test(model)) {
-      if (top_p !== undefined) {
-        requestOptions.top_p = top_p;
-      }
-      if (top_k !== undefined) {
-        requestOptions.top_k = top_k;
-      }
-    }
-
     if (this.useMessages) {
       requestOptions.messages = payload;
       requestOptions.max_tokens =
@@ -769,6 +760,14 @@
       thinkingBudget: this.options.thinkingBudget,
     });

+    if (!/claude-3[-.]7/.test(model)) {
+      requestOptions.top_p = top_p;
+      requestOptions.top_k = top_k;
+    } else if (requestOptions.thinking == null) {
+      requestOptions.topP = top_p;
+      requestOptions.topK = top_k;
+    }
+
     if (this.systemMessage && this.supportsCacheControl === true) {
       requestOptions.system = [
         {
api/app/clients/OpenAIClient.js

Lines changed: 6 additions & 0 deletions
@@ -1309,6 +1309,12 @@ ${convo}
       modelOptions.include_reasoning = true;
       reasoningKey = 'reasoning';
     }
+    if (this.useOpenRouter && modelOptions.reasoning_effort != null) {
+      modelOptions.reasoning = {
+        effort: modelOptions.reasoning_effort,
+      };
+      delete modelOptions.reasoning_effort;
+    }

     this.streamHandler = new SplitStreamHandler({
       reasoningKey,
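Note: OpenRouter expects reasoning options nested under a reasoning object rather than OpenAI's flat reasoning_effort field, so the effort value is re-keyed before the request is built. A rough before/after sketch (the model id is only an example):

// Before (OpenAI-style): { model: 'openai/o3-mini', reasoning_effort: 'high' }
const modelOptions = { model: 'openai/o3-mini', reasoning_effort: 'high' };

if (modelOptions.reasoning_effort != null) {
  modelOptions.reasoning = { effort: modelOptions.reasoning_effort };
  delete modelOptions.reasoning_effort;
}

// After (OpenRouter-style): { model: 'openai/o3-mini', reasoning: { effort: 'high' } }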

api/app/clients/specs/AnthropicClient.test.js

Lines changed: 49 additions & 0 deletions
@@ -680,4 +680,53 @@ describe('AnthropicClient', () => {
       expect(capturedOptions).not.toHaveProperty('top_p');
     });
   });
+
+  it('should include top_k and top_p parameters for Claude-3.7 models when thinking is explicitly disabled', async () => {
+    const client = new AnthropicClient('test-api-key', {
+      modelOptions: {
+        model: 'claude-3-7-sonnet',
+        temperature: 0.7,
+        topK: 10,
+        topP: 0.9,
+      },
+      thinking: false,
+    });
+
+    async function* mockAsyncGenerator() {
+      yield { type: 'message_start', message: { usage: {} } };
+      yield { delta: { text: 'Test response' } };
+      yield { type: 'message_delta', usage: {} };
+    }
+
+    jest.spyOn(client, 'createResponse').mockImplementation(() => {
+      return mockAsyncGenerator();
+    });
+
+    let capturedOptions = null;
+    jest.spyOn(client, 'getClient').mockImplementation((options) => {
+      capturedOptions = options;
+      return {};
+    });
+
+    const payload = [{ role: 'user', content: 'Test message' }];
+    await client.sendCompletion(payload, {});
+
+    expect(capturedOptions).toHaveProperty('topK', 10);
+    expect(capturedOptions).toHaveProperty('topP', 0.9);
+
+    client.setOptions({
+      modelOptions: {
+        model: 'claude-3.7-sonnet',
+        temperature: 0.7,
+        topK: 10,
+        topP: 0.9,
+      },
+      thinking: false,
+    });
+
+    await client.sendCompletion(payload, {});
+
+    expect(capturedOptions).toHaveProperty('topK', 10);
+    expect(capturedOptions).toHaveProperty('topP', 0.9);
+  });
 });

api/package.json

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@
     "@langchain/google-genai": "^0.1.9",
     "@langchain/google-vertexai": "^0.2.0",
     "@langchain/textsplitters": "^0.1.0",
-    "@librechat/agents": "^2.1.3",
+    "@librechat/agents": "^2.1.7",
     "@waylaidwanderer/fetch-event-source": "^3.0.1",
     "axios": "1.7.8",
     "bcryptjs": "^2.4.3",

api/server/controllers/agents/client.js

Lines changed: 47 additions & 15 deletions
@@ -27,10 +27,10 @@ const {
   formatContentStrings,
   createContextHandlers,
 } = require('~/app/clients/prompts');
-const { encodeAndFormat } = require('~/server/services/Files/images/encode');
+const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
 const { getBufferString, HumanMessage } = require('@langchain/core/messages');
+const { encodeAndFormat } = require('~/server/services/Files/images/encode');
 const Tokenizer = require('~/server/services/Tokenizer');
-const { spendTokens } = require('~/models/spendTokens');
 const BaseClient = require('~/app/clients/BaseClient');
 const { getCurrentDateTime } = require('~/utils');
 const { createRun } = require('./run');
@@ -380,32 +380,61 @@ class AgentClient extends BaseClient {
     if (!collectedUsage || !collectedUsage.length) {
       return;
     }
-    const input_tokens = collectedUsage[0]?.input_tokens || 0;
+    const input_tokens =
+      (collectedUsage[0]?.input_tokens || 0) +
+      (Number(collectedUsage[0]?.input_token_details?.cache_creation) || 0) +
+      (Number(collectedUsage[0]?.input_token_details?.cache_read) || 0);

     let output_tokens = 0;
     let previousTokens = input_tokens; // Start with original input
     for (let i = 0; i < collectedUsage.length; i++) {
       const usage = collectedUsage[i];
+      if (!usage) {
+        continue;
+      }
+
+      const cache_creation = Number(usage.input_token_details?.cache_creation) || 0;
+      const cache_read = Number(usage.input_token_details?.cache_read) || 0;
+
+      const txMetadata = {
+        context,
+        conversationId: this.conversationId,
+        user: this.user ?? this.options.req.user?.id,
+        endpointTokenConfig: this.options.endpointTokenConfig,
+        model: usage.model ?? model ?? this.model ?? this.options.agent.model_parameters.model,
+      };
+
       if (i > 0) {
         // Count new tokens generated (input_tokens minus previous accumulated tokens)
-        output_tokens += (Number(usage.input_tokens) || 0) - previousTokens;
+        output_tokens +=
+          (Number(usage.input_tokens) || 0) + cache_creation + cache_read - previousTokens;
       }

       // Add this message's output tokens
       output_tokens += Number(usage.output_tokens) || 0;

       // Update previousTokens to include this message's output
       previousTokens += Number(usage.output_tokens) || 0;
-      spendTokens(
-        {
-          context,
-          conversationId: this.conversationId,
-          user: this.user ?? this.options.req.user?.id,
-          endpointTokenConfig: this.options.endpointTokenConfig,
-          model: usage.model ?? model ?? this.model ?? this.options.agent.model_parameters.model,
-        },
-        { promptTokens: usage.input_tokens, completionTokens: usage.output_tokens },
-      ).catch((err) => {
+
+      if (cache_creation > 0 || cache_read > 0) {
+        spendStructuredTokens(txMetadata, {
+          promptTokens: {
+            input: usage.input_tokens,
+            write: cache_creation,
+            read: cache_read,
+          },
+          completionTokens: usage.output_tokens,
+        }).catch((err) => {
+          logger.error(
+            '[api/server/controllers/agents/client.js #recordCollectedUsage] Error spending structured tokens',
+            err,
+          );
+        });
+      }
+      spendTokens(txMetadata, {
+        promptTokens: usage.input_tokens,
+        completionTokens: usage.output_tokens,
+      }).catch((err) => {
         logger.error(
           '[api/server/controllers/agents/client.js #recordCollectedUsage] Error spending tokens',
           err,
@@ -794,7 +823,10 @@
       throw new Error('Run not initialized');
     }
     const { handleLLMEnd, collected: collectedMetadata } = createMetadataAggregator();
-    const clientOptions = {};
+    /** @type {import('@librechat/agents').ClientOptions} */
+    const clientOptions = {
+      maxTokens: 75,
+    };
     const providerConfig = this.options.req.app.locals[this.options.agent.provider];
     if (
       providerConfig &&
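Note: recordCollectedUsage now folds prompt-cache writes and reads into the running input total and, when cache details are present, records the split via spendStructuredTokens alongside the plain spendTokens transaction. A worked example with invented numbers:

// Hypothetical usage entry (values invented for illustration):
const usage = {
  input_tokens: 200,
  output_tokens: 150,
  input_token_details: { cache_creation: 1000, cache_read: 3000 },
};

const cache_creation = Number(usage.input_token_details?.cache_creation) || 0; // 1000
const cache_read = Number(usage.input_token_details?.cache_read) || 0; // 3000

// Effective prompt size for the running total now includes cached tokens:
const effectiveInput = (Number(usage.input_tokens) || 0) + cache_creation + cache_read; // 4200

// Because cache details are present, spendStructuredTokens receives the split:
// { promptTokens: { input: 200, write: 1000, read: 3000 }, completionTokens: 150 }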

api/server/services/Endpoints/agents/title.js

Lines changed: 10 additions & 1 deletion
@@ -20,10 +20,19 @@ const addTitle = async (req, { text, response, client }) => {

   const titleCache = getLogStores(CacheKeys.GEN_TITLE);
   const key = `${req.user.id}-${response.conversationId}`;
+  const responseText =
+    response?.content && Array.isArray(response?.content)
+      ? response.content.reduce((acc, block) => {
+          if (block?.type === 'text') {
+            return acc + block.text;
+          }
+          return acc;
+        }, '')
+      : (response?.content ?? response?.text ?? '');

   const title = await client.titleConvo({
     text,
-    responseText: response?.text ?? '',
+    responseText,
     conversationId: response.conversationId,
   });
   await titleCache.set(key, title, 120000);
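Note: agent responses can carry an array of content blocks rather than a plain text field, so the title prompt now concatenates only the text blocks. A small illustration (sample blocks are invented):

// Invented response shape for illustration:
const response = {
  conversationId: 'abc123',
  content: [
    { type: 'think', think: 'internal reasoning…' },
    { type: 'text', text: 'Hello' },
    { type: 'text', text: ' world' },
  ],
};

const responseText = Array.isArray(response?.content)
  ? response.content.reduce((acc, block) => (block?.type === 'text' ? acc + block.text : acc), '')
  : (response?.content ?? response?.text ?? '');

// responseText === 'Hello world' — reasoning blocks are skipped for title generation.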

api/server/services/Endpoints/anthropic/llm.js

Lines changed: 8 additions & 7 deletions
@@ -1,6 +1,6 @@
 const { HttpsProxyAgent } = require('https-proxy-agent');
 const { anthropicSettings, removeNullishValues } = require('librechat-data-provider');
-const { checkPromptCacheSupport, getClaudeHeaders } = require('./helpers');
+const { checkPromptCacheSupport, getClaudeHeaders, configureReasoning } = require('./helpers');

 /**
  * Generates configuration options for creating an Anthropic language model (LLM) instance.
@@ -49,13 +49,14 @@ function getLLMConfig(apiKey, options = {}) {
     clientOptions: {},
   };

+  requestOptions = configureReasoning(requestOptions, systemOptions);
+
   if (!/claude-3[-.]7/.test(mergedOptions.model)) {
-    if (mergedOptions.topP !== undefined) {
-      requestOptions.topP = mergedOptions.topP;
-    }
-    if (mergedOptions.topK !== undefined) {
-      requestOptions.topK = mergedOptions.topK;
-    }
+    requestOptions.topP = mergedOptions.topP;
+    requestOptions.topK = mergedOptions.topK;
+  } else if (requestOptions.thinking == null) {
+    requestOptions.topP = mergedOptions.topP;
+    requestOptions.topK = mergedOptions.topK;
   }

   const supportsCacheControl =
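Note: configureReasoning (newly exported from ./helpers) owns the thinking setup, and the topP/topK assignments now run after it so the thinking check sees the final value. A sketch of the shape it is assumed to leave on requestOptions when thinking is enabled, following Anthropic's extended-thinking API (the helper's body is not shown in this diff):

// Assumed output shape when reasoning is requested (values are illustrative):
const requestOptions = {
  model: 'claude-3-7-sonnet-20250219',
  thinking: {
    type: 'enabled',
    budget_tokens: 2000,
  },
};
// With requestOptions.thinking set, the else-if branch above skips topP/topK for Claude 3.7.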

api/server/services/Endpoints/anthropic/llm.spec.js

Lines changed: 41 additions & 0 deletions
@@ -109,4 +109,45 @@ describe('getLLMConfig', () => {
     // Just verifying that the promptCache setting is processed
     expect(result.llmConfig).toBeDefined();
   });
+
+  it('should include topK and topP for Claude-3.7 models when thinking is not enabled', () => {
+    // Test with thinking explicitly set to null/undefined
+    const result = getLLMConfig('test-api-key', {
+      modelOptions: {
+        model: 'claude-3-7-sonnet',
+        topK: 10,
+        topP: 0.9,
+        thinking: false,
+      },
+    });
+
+    expect(result.llmConfig).toHaveProperty('topK', 10);
+    expect(result.llmConfig).toHaveProperty('topP', 0.9);
+
+    // Test with thinking explicitly set to false
+    const result2 = getLLMConfig('test-api-key', {
+      modelOptions: {
+        model: 'claude-3-7-sonnet',
+        topK: 10,
+        topP: 0.9,
+        thinking: false,
+      },
+    });
+
+    expect(result2.llmConfig).toHaveProperty('topK', 10);
+    expect(result2.llmConfig).toHaveProperty('topP', 0.9);
+
+    // Test with decimal notation as well
+    const result3 = getLLMConfig('test-api-key', {
+      modelOptions: {
+        model: 'claude-3.7-sonnet',
+        topK: 10,
+        topP: 0.9,
+        thinking: false,
+      },
+    });
+
+    expect(result3.llmConfig).toHaveProperty('topK', 10);
+    expect(result3.llmConfig).toHaveProperty('topP', 0.9);
+  });
 });

api/server/services/Endpoints/openAI/llm.js

Lines changed: 10 additions & 2 deletions
@@ -29,7 +29,6 @@ function getLLMConfig(apiKey, options = {}) {
   const {
     modelOptions = {},
     reverseProxyUrl,
-    useOpenRouter,
    defaultQuery,
     headers,
     proxy,
@@ -56,9 +55,11 @@ function getLLMConfig(apiKey, options = {}) {
     });
   }

+  let useOpenRouter;
   /** @type {OpenAIClientOptions['configuration']} */
   const configOptions = {};
-  if (useOpenRouter || (reverseProxyUrl && reverseProxyUrl.includes(KnownEndpoints.openrouter))) {
+  if (reverseProxyUrl && reverseProxyUrl.includes(KnownEndpoints.openrouter)) {
+    useOpenRouter = true;
     llmConfig.include_reasoning = true;
     configOptions.baseURL = reverseProxyUrl;
     configOptions.defaultHeaders = Object.assign(
@@ -118,6 +119,13 @@ function getLLMConfig(apiKey, options = {}) {
     llmConfig.organization = process.env.OPENAI_ORGANIZATION;
   }

+  if (useOpenRouter && llmConfig.reasoning_effort != null) {
+    llmConfig.reasoning = {
+      effort: llmConfig.reasoning_effort,
+    };
+    delete llmConfig.reasoning_effort;
+  }
+
   return {
     /** @type {OpenAIClientOptions} */
     llmConfig,

client/src/components/Chat/Messages/Content/ContentParts.tsx

Lines changed: 3 additions & 1 deletion
@@ -109,7 +109,9 @@ const ContentParts = memo(
                 return val;
               })
             }
-            label={isSubmitting ? localize('com_ui_thinking') : localize('com_ui_thoughts')}
+            label={
+              isSubmitting && isLast ? localize('com_ui_thinking') : localize('com_ui_thoughts')
+            }
           />
         </div>
       )}
