
Commit fe1a77b

documentation: [grok usecase] Image inputs and Captioning
1 parent 6190dbf commit fe1a77b

File tree

2 files changed: +343 -1 lines changed

agentchat_grok_usecase_image_inputs.ipynb

Lines changed: 332 additions & 0 deletions
@@ -0,0 +1,332 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "0",
6+
"metadata": {},
7+
"source": [
8+
"# Grok Use Case: Image Inputs\n",
9+
"### This notebook demonstrates how to use Grok for analyzing and reasoning over image inputs, specifically focusing on software architecture diagrams."
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"id": "1",
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"import base64\n",
20+
"import os\n",
21+
"import textwrap\n",
22+
"\n",
23+
"from dotenv import load_dotenv\n",
24+
"\n",
25+
"from autogen import LLMConfig, UserProxyAgent\n",
26+
"from autogen.agentchat import initiate_group_chat\n",
27+
"from autogen.agentchat.assistant_agent import AssistantAgent\n",
28+
"from autogen.agentchat.conversable_agent import ConversableAgent\n",
29+
"from autogen.agentchat.group import AgentNameTarget\n",
30+
"from autogen.agentchat.group.llm_condition import StringLLMCondition\n",
31+
"from autogen.agentchat.group.on_condition import OnCondition\n",
32+
"from autogen.agentchat.group.patterns.pattern import DefaultPattern\n",
33+
"\n",
34+
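"# Load XAI_API_KEY (and any other settings) from a local .env file\n",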
"load_dotenv()"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"id": "2",
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"# Initialize LLMConfig for Grok\n",
45+
"llm_config = LLMConfig(\n",
46+
" config_list=[\n",
47+
" {\n",
48+
" \"model\": \"grok-4\",\n",
49+
" \"api_type\": \"openai\", # Use existing openai type only\n",
50+
" \"base_url\": \"https://api.x.ai/v1\",\n",
51+
" \"api_key\": os.getenv(\"XAI_API_KEY\"),\n",
52+
" \"max_tokens\": 1000,\n",
53+
" }\n",
54+
" ],\n",
55+
" temperature=0.5,\n",
56+
")\n",
57+
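"# Responses API config with the built-in image_generation tool (used by the design agent)\n",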
"image_config = LLMConfig(\n",
58+
" api_type=\"responses\", model=\"grok-4\", api_key=os.getenv(\"XAI_API_KEY\"), built_in_tools=[\"image_generation\"]\n",
59+
")"
60+
]
61+
},
62+
{
63+
"cell_type": "markdown",
64+
"id": "3",
65+
"metadata": {},
66+
"source": [
67+
"## The Example Demonsrates image generation and captioning capabilities of grok 4 with following architecture."
68+
]
69+
},
70+
{
71+
"cell_type": "markdown",
72+
"id": "4",
73+
"metadata": {},
74+
"source": [
75+
"1. **Image Generation:** Highly detailed Image Generation.\n",
76+
"2. **Image Captioning:** Precise Image OCR capabilities."
77+
]
78+
},
79+
{
80+
"cell_type": "markdown",
81+
"id": "5",
82+
"metadata": {},
83+
"source": [
84+
"### Solution Architect Agent architecture"
85+
]
86+
},
87+
{
88+
"cell_type": "markdown",
89+
"id": "6",
90+
"metadata": {},
91+
"source": [
92+
"1. Analyst agent (OCR on Image)\n",
93+
"2. Solution Architect (Enhance existing architecture)\n",
94+
"3. User Agent \n",
95+
"4. Design Agent (for Generating and performing analysis on image)"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": null,
101+
"id": "7",
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"with llm_config:\n",
106+
" analyst = AssistantAgent(\n",
107+
" name=\"analyst\",\n",
108+
" system_message=textwrap.dedent(\"\"\"\n",
109+
" You are an Analyst agent that can reason over images.\n",
110+
" You will be provided with an image and you will need to analyze it.\n",
111+
" the image will most probably an image of a software architecture.\n",
112+
" You will need to analyze the image and provide a detailed analysis of the software architecture.\n",
113+
" \"\"\").strip(),\n",
114+
" )\n",
115+
"\n",
116+
" solution_architect = ConversableAgent(\n",
117+
" name=\"solution_architect\",\n",
118+
" system_message=textwrap.dedent(\"\"\"\n",
119+
" You are a solution architect that can reason over descriptions of an software architecture.\n",
120+
" You will be provided with a description of a software architecture and you will need to analyze it.\n",
121+
" You will need to analyze the description and provide and propose a new software architecture with enhancements.\n",
122+
" the new architecture should be more efficient, secure, and scalable.\n",
123+
" the new architecture should include the following components:\n",
124+
" 1) IMPORTANT: only provide the FLOW of new Architecture components from start to end.\n",
125+
" 2) IMPORTANT: flow should be concise and to the point. as a graph with description of each node and connection.\n",
126+
" 3) exit once image is generated.\n",
127+
" \"\"\").strip(),\n",
128+
" max_consecutive_auto_reply=1,\n",
129+
" )\n",
130+
"\n",
131+
" user_agent = UserProxyAgent(\n",
132+
" name=\"user\",\n",
133+
" human_input_mode=\"ALWAYS\",\n",
134+
" )\n",
135+
"\n",
136+
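"# Design agent: renders architecture flows as images via the image_generation built-in tool\n",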
"design_agent = AssistantAgent(\n",
137+
" name=\"design_agent\",\n",
138+
" llm_config=llm_config,\n",
139+
" system_message=textwrap.dedent(\"\"\"\n",
140+
" generate images for software architecture.\n",
141+
" the image should be a flow of the software architecture.\n",
142+
" the image should be in a format that can be used to generate a software architecture.\n",
143+
" # if solution architect returns a new software architecture flow, you should generate an image for the new software architecture flow.\n",
144+
" \"\"\").strip(),\n",
145+
" max_consecutive_auto_reply=1,\n",
146+
")"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": null,
152+
"id": "8",
153+
"metadata": {},
154+
"outputs": [],
155+
"source": [
156+
"# ----helper function to save image from base64 string----\n",
157+
"def save_b64_png(b64_str, fname=\"generated.png\"):\n",
158+
" with open(fname, \"wb\") as f:\n",
159+
" f.write(base64.b64decode(b64_str))\n",
160+
" print(f\"image saved → {fname}\")\n",
161+
"\n",
162+
"\n",
163+
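"# Walk the chat result and save any base64 images returned by the design agent's image_generation tool calls\n",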
"def save_artbot_images_from_response(response):\n",
164+
" messages = response.messages\n",
165+
" for i in range(len(messages)):\n",
166+
" print(i)\n",
167+
" message = messages[i]\n",
168+
" if message.get(\"name\") == \"design_agent\":\n",
169+
" contents = message.get(\"content\", [])\n",
170+
" for content in contents:\n",
171+
" if (\n",
172+
" content.get(\"type\") == \"tool_call\"\n",
173+
" and content.get(\"name\") == \"image_generation\"\n",
174+
" and \"content\" in content\n",
175+
" and content[\"content\"]\n",
176+
" ):\n",
177+
" print(\"Saving image!\")\n",
178+
" save_b64_png(content[\"content\"], f\"image{i}.png\")"
179+
]
180+
},
181+
{
182+
"cell_type": "markdown",
183+
"id": "9",
184+
"metadata": {},
185+
"source": [
186+
"### Define tools for agent and tool description\n",
187+
"1. To Get Image Descriptions\n",
188+
"2. To Generate Image"
189+
]
190+
},
191+
{
192+
"cell_type": "code",
193+
"execution_count": null,
194+
"id": "10",
195+
"metadata": {},
196+
"outputs": [],
197+
"source": [
198+
"decription_tool_prompt = \"\"\"\n",
199+
"This tool is used to get the description of the architecture image.\n",
200+
"Input Args:\n",
201+
"- image_url: str (url of the architecture image)\n",
202+
"\"\"\"\n",
203+
"\n",
204+
"\n",
205+
"@analyst.register_for_llm(description=decription_tool_prompt)\n",
206+
"@user_agent.register_for_execution(description=decription_tool_prompt)\n",
207+
"async def get_image_description(image_url: str):\n",
208+
" prompt = f\"\"\"\n",
209+
" Given the following architecture image: {image_url}\n",
210+
" Return a short and concise description of the image.\n",
211+
" Then, provide the flow of the architecture in clear, numbered or bulleted points.\n",
212+
" Format:\n",
213+
" Description: <one paragraph understanding the architecture>\n",
214+
" Flow:\n",
215+
" 1. <first step/component>(description)\n",
216+
" 2. <second step/component>(description)\n",
217+
" ...\n",
218+
" Only include the essential components and their order in the flow.\n",
219+
" \"\"\"\n",
220+
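" # Build a multimodal message: the text prompt plus the image URL as an image_url content part\n",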
" chat = {\n",
221+
" \"role\": \"user\",\n",
222+
" \"content\": [\n",
223+
" {\n",
224+
" \"type\": \"input_text\",\n",
225+
" \"text\": textwrap.dedent(f\"\"\"\n",
226+
" {prompt}\n",
227+
" \"\"\").strip(),\n",
228+
" },\n",
229+
" {\"type\": \"image_url\", \"image_url\": {\"url\": image_url, \"detail\": \"high\"}},\n",
230+
" ],\n",
231+
" }\n",
232+
" design_agent.run(message=chat, user_input=False, max_rounds=1).process()\n",
233+
" last_message = design_agent.last_message()\n",
234+
" return last_message[\"content\"]\n",
235+
"\n",
236+
"\n",
237+
"tool_prompt = \"\"\"\n",
238+
"This tool is used to generate an architecture flowchart image for the provided software architecture flow.\n",
239+
"Input Args:\n",
240+
"- architecture_flow: str (detail flow of the software architecture in numbered or bulleted points)\n",
241+
"\"\"\"\n",
242+
"\n",
243+
"\n",
244+
"@solution_architect.register_for_llm(description=tool_prompt)\n",
245+
"@user_agent.register_for_execution(description=tool_prompt)\n",
246+
"async def design_architecture(architecture_flow: str):\n",
247+
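" # Ask the design agent to render the proposed flow as a flowchart image, then save any generated images\n",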
" response = design_agent.run(\n",
248+
" message=f\"generate an architecture flowchart image for the following software architecture flow: {architecture_flow}\",\n",
249+
" chat_history=True,\n",
250+
" user_input=False,\n",
251+
" max_turns=1,\n",
252+
" ).process()\n",
253+
"\n",
254+
" last_message = design_agent.last_message()\n",
255+
" save_artbot_images_from_response(response)\n",
256+
" return last_message[\"content\"][-1]"
257+
]
258+
},
259+
{
260+
"cell_type": "markdown",
261+
"id": "11",
262+
"metadata": {},
263+
"source": [
264+
"### DefaultPattern utilizing an LLM-based handoff condition"
265+
]
266+
},
267+
{
268+
"cell_type": "code",
269+
"execution_count": null,
270+
"id": "12",
271+
"metadata": {},
272+
"outputs": [],
273+
"source": [
274+
"default_pattern = DefaultPattern(\n",
275+
" initial_agent=analyst,\n",
276+
" agents=[analyst, solution_architect],\n",
277+
" user_agent=user_agent,\n",
278+
" group_manager_args={\"llm_config\": llm_config},\n",
279+
")\n",
280+
"\n",
281+
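"# Hand off from the analyst to the solution architect once an image description/analysis has been produced\n",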
"analyst.handoffs.add_llm_conditions([\n",
282+
" OnCondition(\n",
283+
" target=AgentNameTarget(\"solution_architect\"),\n",
284+
" condition=StringLLMCondition(prompt=\"When Analyst agent returns Description/Analysis of an Architecture Image\"),\n",
285+
" ),\n",
286+
"])"
287+
]
288+
},
289+
{
290+
"cell_type": "code",
291+
"execution_count": null,
292+
"id": "13",
293+
"metadata": {},
294+
"outputs": [],
295+
"source": [
296+
"IMAGE_URL = \"https://user-images.githubusercontent.com/65826354/179526761-7f473e3d-f71c-429d-bf49-16958c5cb7a6.png\"\n",
297+
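"# Start the group chat: the analyst describes the image, then hands off to the solution architect for enhancements\n",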
"default_paresult, context, last_agent = initiate_group_chat(\n",
298+
" pattern=default_pattern,\n",
299+
" messages=f\"Describe this image {IMAGE_URL} provide a detailed analysis of the software architecture.\",\n",
300+
" max_rounds=20,\n",
301+
")"
302+
]
303+
}
304+
],
305+
"metadata": {
306+
"front_matter": {
307+
"description": "Using MathChat to Solve Math Problems",
308+
"tags": [
309+
"grok"
310+
]
311+
},
312+
"kernelspec": {
313+
"display_name": "Python 3",
314+
"language": "python",
315+
"name": "python3"
316+
},
317+
"language_info": {
318+
"codemirror_mode": {
319+
"name": "ipython",
320+
"version": 3
321+
},
322+
"file_extension": ".py",
323+
"mimetype": "text/x-python",
324+
"name": "python",
325+
"nbconvert_exporter": "python",
326+
"pygments_lexer": "ipython3",
327+
"version": "3.13.5"
328+
}
329+
},
330+
"nbformat": 4,
331+
"nbformat_minor": 5
332+
}

test/test_notebook.py

Lines changed: 11 additions & 1 deletion
@@ -150,10 +150,20 @@ def test_agentchat_groupchat_stateflow(save=False):
150150
reason="do not run if py!=3.13",
151151
)
152152
@run_for_optional_imports(["openai"], "openai")
153-
def test_agentchat_grok_sbom_analysisw(save=False):
153+
def test_agentchat_grok_sbom_analysis(save=False):
154154
run_notebook("agentchat_grok_sbom_analysis.ipynb", save=save)
155155

156156

157+
@run_for_optional_imports("openai", "openai")
158+
@pytest.mark.skipif(
159+
not sys.version.startswith("3.13"),
160+
reason="do not run if py!=3.13",
161+
)
162+
@run_for_optional_imports(["openai"], "openai")
163+
def test_agentchat_grok_usecase_image_inputs(save=False):
164+
run_notebook("agentchat_grok_usecase_image_inputs.ipynb", save=save)
165+
166+
157167
if __name__ == "__main__":
158168
# test_agentchat_auto_feedback_from_code(save=True)
159169
# test_oai_chatgpt_gpt4(save=True)
