Commit 798e876

👓 feat: Vision Support for Assistants (#2195)

* refactor(assistants/chat): use promises to speed up initialization, initialize shared variables, pass `attachedFileIds` to `StreamRunManager`
* chore: additional typedefs
* fix(OpenAIClient): handle edge case where attachments promise is resolved
* feat: createVisionPrompt
* feat: Vision Support for Assistants

1 parent 1f0fb49 · commit 798e876

File tree

16 files changed (+371 −95 lines)

api/app/clients/OpenAIClient.js

Lines changed: 5 additions & 1 deletion

```diff
@@ -92,7 +92,11 @@ class OpenAIClient extends BaseClient {
     }
 
     this.defaultVisionModel = this.options.visionModel ?? 'gpt-4-vision-preview';
-    this.options.attachments?.then((attachments) => this.checkVisionRequest(attachments));
+    if (typeof this.options.attachments?.then === 'function') {
+      this.options.attachments.then((attachments) => this.checkVisionRequest(attachments));
+    } else {
+      this.checkVisionRequest(this.options.attachments);
+    }
 
     const { OPENROUTER_API_KEY, OPENAI_FORCE_PROMPT } = process.env ?? {};
     if (OPENROUTER_API_KEY && !this.azure) {
```
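
This is the "attachments promise is resolved" edge case from the commit message: optional chaining only guards against `attachments` being nullish, so `attachments?.then(cb)` still throws a TypeError when `attachments` is already a plain array (`[].then` is `undefined`). A minimal sketch of the duck-typing check, with a hypothetical `checkVisionRequestSafely` helper standing in for the client method:

```js
// Sketch only: `checkVisionRequestSafely` is illustrative, not part of the codebase.
function checkVisionRequestSafely(client, attachments) {
  // `typeof x?.then === 'function'` is the standard duck-typing test for a
  // thenable; it is false for arrays, plain objects, and undefined alike.
  if (typeof attachments?.then === 'function') {
    // Still a pending promise: defer the vision check until it resolves.
    attachments.then((resolved) => client.checkVisionRequest(resolved));
  } else {
    // Already-resolved value (or undefined): check synchronously.
    client.checkVisionRequest(attachments);
  }
}
```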
api/app/clients/prompts/createVisionPrompt.js (new file)

Lines changed: 34 additions & 0 deletions

```diff
@@ -0,0 +1,34 @@
+/**
+ * Generates a prompt instructing the user to describe an image in detail, tailored to different types of visual content.
+ * @param {boolean} pluralized - Whether to pluralize the prompt for multiple images.
+ * @returns {string} - The generated vision prompt.
+ */
+const createVisionPrompt = (pluralized = false) => {
+  return `Please describe the image${
+    pluralized ? 's' : ''
+  } in detail, covering relevant aspects such as:
+
+For photographs, illustrations, or artwork:
+- The main subject(s) and their appearance, positioning, and actions
+- The setting, background, and any notable objects or elements
+- Colors, lighting, and overall mood or atmosphere
+- Any interesting details, textures, or patterns
+- The style, technique, or medium used (if discernible)
+
+For screenshots or images containing text:
+- The content and purpose of the text
+- The layout, formatting, and organization of the information
+- Any notable visual elements, such as logos, icons, or graphics
+- The overall context or message conveyed by the screenshot
+
+For graphs, charts, or data visualizations:
+- The type of graph or chart (e.g., bar graph, line chart, pie chart)
+- The variables being compared or analyzed
+- Any trends, patterns, or outliers in the data
+- The axis labels, scales, and units of measurement
+- The title, legend, and any additional context provided
+
+Be as specific and descriptive as possible while maintaining clarity and concision.`;
+};
+
+module.exports = createVisionPrompt;
```
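
For reference, a quick usage sketch of the new helper (local require path assumed):

```js
const createVisionPrompt = require('./createVisionPrompt');

// Singular form: "Please describe the image in detail, ..."
const single = createVisionPrompt();

// Pluralized form: "Please describe the images in detail, ..."
const multiple = createVisionPrompt(true);

console.log(single.startsWith('Please describe the image in detail')); // true
console.log(multiple.startsWith('Please describe the images in detail')); // true
```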

api/app/clients/prompts/index.js

Lines changed: 2 additions & 0 deletions

```diff
@@ -4,6 +4,7 @@ const handleInputs = require('./handleInputs');
 const instructions = require('./instructions');
 const titlePrompts = require('./titlePrompts');
 const truncateText = require('./truncateText');
+const createVisionPrompt = require('./createVisionPrompt');
 const createContextHandlers = require('./createContextHandlers');
 
 module.exports = {
@@ -13,5 +14,6 @@ module.exports = {
   ...instructions,
   ...titlePrompts,
   truncateText,
+  createVisionPrompt,
   createContextHandlers,
 };
```
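
With the barrel export in place, consumers can pull the helper from the prompts module, exactly as the chat.js diff below does:

```js
// Import via the prompts barrel, as used in api/server/routes/assistants/chat.js:
const { formatMessage, createVisionPrompt } = require('~/app/clients/prompts');
```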

api/server/routes/assistants/chat.js

Lines changed: 159 additions & 74 deletions

```diff
@@ -4,9 +4,11 @@ const {
   Constants,
   RunStatus,
   CacheKeys,
+  FileSources,
   ContentTypes,
   EModelEndpoint,
   ViolationTypes,
+  ImageVisionTool,
   AssistantStreamEvents,
 } = require('librechat-data-provider');
 const {
@@ -17,9 +19,10 @@ const {
   addThreadMetadata,
   saveAssistantMessage,
 } = require('~/server/services/Threads');
+const { sendResponse, sendMessage, sleep, isEnabled, countTokens } = require('~/server/utils');
 const { runAssistant, createOnTextProgress } = require('~/server/services/AssistantService');
 const { addTitle, initializeClient } = require('~/server/services/Endpoints/assistants');
-const { sendResponse, sendMessage, sleep, isEnabled, countTokens } = require('~/server/utils');
+const { formatMessage, createVisionPrompt } = require('~/app/clients/prompts');
 const { createRun, StreamRunManager } = require('~/server/services/Runs');
 const { getTransactions } = require('~/models/Transaction');
 const checkBalance = require('~/models/checkBalance');
@@ -100,6 +103,16 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
   let parentMessageId = _parentId;
   /** @type {TMessage[]} */
   let previousMessages = [];
+  /** @type {import('librechat-data-provider').TConversation | null} */
+  let conversation = null;
+  /** @type {string[]} */
+  let file_ids = [];
+  /** @type {Set<string>} */
+  let attachedFileIds = new Set();
+  /** @type {TMessage | null} */
+  let requestMessage = null;
+  /** @type {undefined | Promise<ChatCompletion>} */
+  let visionPromise;
 
   const userMessageId = v4();
   const responseMessageId = v4();
@@ -258,7 +271,10 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
       throw new Error('Missing assistant_id');
     }
 
-    if (isEnabled(process.env.CHECK_BALANCE)) {
+    const checkBalanceBeforeRun = async () => {
+      if (!isEnabled(process.env.CHECK_BALANCE)) {
+        return;
+      }
       const transactions =
         (await getTransactions({
           user: req.user.id,
@@ -288,7 +304,7 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
           amount: promptTokens,
         },
       });
-    }
+    };
 
     /** @type {{ openai: OpenAIClient }} */
     const { openai: _openai, client } = await initializeClient({
@@ -300,103 +316,168 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
 
     openai = _openai;
 
-    // if (thread_id) {
-    //   previousMessages = await checkMessageGaps({ openai, thread_id, conversationId });
-    // }
-
     if (previousMessages.length) {
       parentMessageId = previousMessages[previousMessages.length - 1].messageId;
     }
 
-    const userMessage = {
+    let userMessage = {
       role: 'user',
       content: text,
       metadata: {
         messageId: userMessageId,
       },
     };
 
-    let thread_file_ids = [];
-    if (convoId) {
-      const convo = await getConvo(req.user.id, convoId);
-      if (convo && convo.file_ids) {
-        thread_file_ids = convo.file_ids;
-      }
+    /** @type {CreateRunBody | undefined} */
+    const body = {
+      assistant_id,
+      model,
+    };
+
+    if (promptPrefix) {
+      body.additional_instructions = promptPrefix;
     }
 
-    const file_ids = files.map(({ file_id }) => file_id);
-    if (file_ids.length || thread_file_ids.length) {
-      userMessage.file_ids = file_ids;
-      openai.attachedFileIds = new Set([...file_ids, ...thread_file_ids]);
+    if (instructions) {
+      body.instructions = instructions;
     }
 
-    // TODO: may allow multiple messages to be created beforehand in a future update
-    const initThreadBody = {
-      messages: [userMessage],
-      metadata: {
-        user: req.user.id,
-        conversationId,
-      },
+    const getRequestFileIds = async () => {
+      let thread_file_ids = [];
+      if (convoId) {
+        const convo = await getConvo(req.user.id, convoId);
+        if (convo && convo.file_ids) {
+          thread_file_ids = convo.file_ids;
+        }
+      }
+
+      file_ids = files.map(({ file_id }) => file_id);
+      if (file_ids.length || thread_file_ids.length) {
+        userMessage.file_ids = file_ids;
+        attachedFileIds = new Set([...file_ids, ...thread_file_ids]);
+      }
     };
 
-    const result = await initThread({ openai, body: initThreadBody, thread_id });
-    thread_id = result.thread_id;
+    const addVisionPrompt = async () => {
+      if (!req.body.endpointOption.attachments) {
+        return;
+      }
 
-    createOnTextProgress({
-      openai,
-      conversationId,
-      userMessageId,
-      messageId: responseMessageId,
-      thread_id,
-    });
+      const assistant = await openai.beta.assistants.retrieve(assistant_id);
+      const visionToolIndex = assistant.tools.findIndex(
+        (tool) => tool.function.name === ImageVisionTool.function.name,
+      );
 
-    const requestMessage = {
-      user: req.user.id,
-      text,
-      messageId: userMessageId,
-      parentMessageId,
-      // TODO: make sure client sends correct format for `files`, use zod
-      files,
-      file_ids,
-      conversationId,
-      isCreatedByUser: true,
-      assistant_id,
-      thread_id,
-      model: assistant_id,
-    };
+      if (visionToolIndex === -1) {
+        return;
+      }
 
-    previousMessages.push(requestMessage);
+      const attachments = await req.body.endpointOption.attachments;
+      let visionMessage = {
+        role: 'user',
+        content: '',
+      };
+      const files = await client.addImageURLs(visionMessage, attachments);
+      if (!visionMessage.image_urls?.length) {
+        return;
+      }
 
-    await saveUserMessage({ ...requestMessage, model });
+      const imageCount = visionMessage.image_urls.length;
+      const plural = imageCount > 1;
+      visionMessage.content = createVisionPrompt(plural);
+      visionMessage = formatMessage({ message: visionMessage, endpoint: EModelEndpoint.openAI });
 
-    const conversation = {
-      conversationId,
-      // TODO: title feature
-      title: 'New Chat',
-      endpoint: EModelEndpoint.assistants,
-      promptPrefix: promptPrefix,
-      instructions: instructions,
-      assistant_id,
-      // model,
-    };
+      visionPromise = openai.chat.completions.create({
+        model: 'gpt-4-vision-preview',
+        messages: [visionMessage],
+        max_tokens: 4000,
+      });
 
-    if (file_ids.length) {
-      conversation.file_ids = file_ids;
-    }
+      const pluralized = plural ? 's' : '';
+      body.additional_instructions = `${
+        body.additional_instructions ? `${body.additional_instructions}\n` : ''
+      }The user has uploaded ${imageCount} image${pluralized}.
+      Use the \`${ImageVisionTool.function.name}\` tool to retrieve ${
+        plural ? '' : 'a '
+      }detailed text description${pluralized} for ${plural ? 'each' : 'the'} image${pluralized}.`;
 
-    /** @type {CreateRunBody} */
-    const body = {
-      assistant_id,
-      model,
+      return files;
     };
 
-    if (promptPrefix) {
-      body.additional_instructions = promptPrefix;
-    }
+    const initializeThread = async () => {
+      /** @type {[ undefined | MongoFile[]]} */
+      const [processedFiles] = await Promise.all([addVisionPrompt(), getRequestFileIds()]);
+      // TODO: may allow multiple messages to be created beforehand in a future update
+      const initThreadBody = {
+        messages: [userMessage],
+        metadata: {
+          user: req.user.id,
+          conversationId,
+        },
+      };
 
-    if (instructions) {
-      body.instructions = instructions;
-    }
+      if (processedFiles) {
+        for (const file of processedFiles) {
+          if (file.source !== FileSources.openai) {
+            attachedFileIds.delete(file.file_id);
+            const index = file_ids.indexOf(file.file_id);
+            if (index > -1) {
+              file_ids.splice(index, 1);
+            }
+          }
+        }
+
+        userMessage.file_ids = file_ids;
+      }
+
+      const result = await initThread({ openai, body: initThreadBody, thread_id });
+      thread_id = result.thread_id;
+
+      createOnTextProgress({
+        openai,
+        conversationId,
+        userMessageId,
+        messageId: responseMessageId,
+        thread_id,
+      });
+
+      requestMessage = {
+        user: req.user.id,
+        text,
+        messageId: userMessageId,
+        parentMessageId,
+        // TODO: make sure client sends correct format for `files`, use zod
+        files,
+        file_ids,
+        conversationId,
+        isCreatedByUser: true,
+        assistant_id,
+        thread_id,
+        model: assistant_id,
+      };
+
+      previousMessages.push(requestMessage);
+
+      /* asynchronous */
+      saveUserMessage({ ...requestMessage, model });
+
+      conversation = {
+        conversationId,
+        title: 'New Chat',
+        endpoint: EModelEndpoint.assistants,
+        promptPrefix: promptPrefix,
+        instructions: instructions,
+        assistant_id,
+        // model,
+      };
+
+      if (file_ids.length) {
+        conversation.file_ids = file_ids;
+      }
+    };
+
+    const promises = [initializeThread(), checkBalanceBeforeRun()];
+    await Promise.all(promises);
 
     const sendInitialResponse = () => {
       sendMessage(res, {
@@ -421,6 +502,8 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
 
     const processRun = async (retry = false) => {
       if (req.app.locals[EModelEndpoint.azureOpenAI]?.assistants) {
+        openai.attachedFileIds = attachedFileIds;
+        openai.visionPromise = visionPromise;
         if (retry) {
          response = await runAssistant({
            openai,
@@ -463,9 +546,11 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
         req,
         res,
         openai,
+        handlers,
         thread_id,
+        visionPromise,
+        attachedFileIds,
         responseMessage: openai.responseMessage,
-        handlers,
         // streamOptions: {
 
         // },
```
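
The net effect of the "use promises to speed up initialization" refactor is that independent async work overlaps instead of running serially: vision-prompt setup and file-id collection run concurrently inside `initializeThread`, and thread initialization itself overlaps with the balance check. A self-contained sketch of the pattern (the stub functions and timings below are illustrative, not LibreChat APIs):

```js
// Stubs simulating the independent steps from the route handler above.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

const addVisionPrompt = async () => {
  await sleep(50); // e.g. retrieve the assistant, kick off the vision request
  return [{ file_id: 'file_abc', source: 'openai' }];
};

const getRequestFileIds = async () => {
  await sleep(30); // e.g. load conversation file ids from the database
};

const checkBalanceBeforeRun = async () => {
  await sleep(40); // e.g. count tokens and verify transactions
};

const initializeThread = async () => {
  // The two prep steps are independent, so they run concurrently;
  // thread creation proceeds only after both settle.
  const [processedFiles] = await Promise.all([addVisionPrompt(), getRequestFileIds()]);
  return processedFiles;
};

// Thread initialization overlaps with the balance check, so total latency is
// roughly the slower branch rather than the sum of all steps.
Promise.all([initializeThread(), checkBalanceBeforeRun()]).then(([files]) => {
  console.log('initialized with', files.length, 'processed file(s)');
});
```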

api/server/services/Runs/StreamRunManager.js

Lines changed: 4 additions & 0 deletions

```diff
@@ -59,6 +59,10 @@ class StreamRunManager {
     this.messages = [];
     /** @type {string} */
     this.text = '';
+    /** @type {Set<string>} */
+    this.attachedFileIds = fields.attachedFileIds;
+    /** @type {undefined | Promise<ChatCompletion>} */
+    this.visionPromise = fields.visionPromise;
 
     /**
      * @type {Object.<AssistantStreamEvents, (event: AssistantStreamEvent) => Promise<void>>}
```
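
These fields arrive via the constructor's `fields` argument. A sketch of the presumed call site, assuming the field list in the chat.js hunk above belongs to a `new StreamRunManager(...)` call (variable names taken from that route handler):

```js
// Assumed call-site shape; mirrors the fields added in the chat.js diff.
const streamRunManager = new StreamRunManager({
  req,
  res,
  openai,
  handlers,
  thread_id,
  visionPromise, // undefined | Promise<ChatCompletion>, produced by addVisionPrompt()
  attachedFileIds, // Set<string> of file ids attached to the request and thread
  responseMessage: openai.responseMessage,
});
```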
