
Commit fe1a77b

documentation: [grok usecase] Image inputs and Captioning
1 parent 6190dbf commit fe1a77b

File tree

2 files changed: +343 -1 lines changed

agentchat_grok_usecase_image_inputs.ipynb

Lines changed: 332 additions & 0 deletions
@@ -0,0 +1,332 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "0",
6+
"metadata": {},
7+
"source": [
8+
"# Grok Use Case: Image Inputs\n",
9+
"### This notebook demonstrates how to use Grok for analyzing and reasoning over image inputs, specifically focusing on software architecture diagrams."
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"id": "1",
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"import base64\n",
20+
"import os\n",
21+
"import textwrap\n",
22+
"\n",
23+
"from dotenv import load_dotenv\n",
24+
"\n",
25+
"from autogen import LLMConfig, UserProxyAgent\n",
26+
"from autogen.agentchat import initiate_group_chat\n",
27+
"from autogen.agentchat.assistant_agent import AssistantAgent\n",
28+
"from autogen.agentchat.conversable_agent import ConversableAgent\n",
29+
"from autogen.agentchat.group import AgentNameTarget\n",
30+
"from autogen.agentchat.group.llm_condition import StringLLMCondition\n",
31+
"from autogen.agentchat.group.on_condition import OnCondition\n",
32+
"from autogen.agentchat.group.patterns.pattern import DefaultPattern\n",
33+
"\n",
34+
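"# Load XAI_API_KEY (and any other settings) from a local .env file\n",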
"load_dotenv()"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"id": "2",
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"# Initialize LLMConfig for Grok\n",
45+
"llm_config = LLMConfig(\n",
46+
" config_list=[\n",
47+
" {\n",
48+
" \"model\": \"grok-4\",\n",
49+
" \"api_type\": \"openai\", # Use existing openai type only\n",
50+
" \"base_url\": \"https://api.x.ai/v1\",\n",
51+
" \"api_key\": os.getenv(\"XAI_API_KEY\"),\n",
52+
" \"max_tokens\": 1000,\n",
53+
" }\n",
54+
" ],\n",
55+
" temperature=0.5,\n",
56+
")\n",
57+
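"# Responses API config with the built-in image_generation tool (used by the design agent)\n",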
"image_config = LLMConfig(\n",
58+
" api_type=\"responses\", model=\"grok-4\", api_key=os.getenv(\"XAI_API_KEY\"), built_in_tools=[\"image_generation\"]\n",
59+
")"
60+
]
61+
},
62+
{
63+
"cell_type": "markdown",
64+
"id": "3",
65+
"metadata": {},
66+
"source": [
67+
"## The Example Demonsrates image generation and captioning capabilities of grok 4 with following architecture."
68+
]
69+
},
70+
{
71+
"cell_type": "markdown",
72+
"id": "4",
73+
"metadata": {},
74+
"source": [
75+
"1. **Image Generation:** Highly detailed Image Generation.\n",
76+
"2. **Image Captioning:** Precise Image OCR capabilities."
77+
]
78+
},
79+
{
80+
"cell_type": "markdown",
81+
"id": "5",
82+
"metadata": {},
83+
"source": [
84+
"### Solution Architect Agent architecture"
85+
]
86+
},
87+
{
88+
"cell_type": "markdown",
89+
"id": "6",
90+
"metadata": {},
91+
"source": [
92+
"1. Analyst agent (OCR on Image)\n",
93+
"2. Solution Architect (Enhance existing architecture)\n",
94+
"3. User Agent \n",
95+
"4. Design Agent (for Generating and performing analysis on image)"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": null,
101+
"id": "7",
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"with llm_config:\n",
106+
" analyst = AssistantAgent(\n",
107+
" name=\"analyst\",\n",
108+
" system_message=textwrap.dedent(\"\"\"\n",
109+
" You are an Analyst agent that can reason over images.\n",
110+
" You will be provided with an image and you will need to analyze it.\n",
111+
" the image will most probably an image of a software architecture.\n",
112+
" You will need to analyze the image and provide a detailed analysis of the software architecture.\n",
113+
" \"\"\").strip(),\n",
114+
" )\n",
115+
"\n",
116+
" solution_architect = ConversableAgent(\n",
117+
" name=\"solution_architect\",\n",
118+
" system_message=textwrap.dedent(\"\"\"\n",
119+
" You are a solution architect that can reason over descriptions of an software architecture.\n",
120+
" You will be provided with a description of a software architecture and you will need to analyze it.\n",
121+
" You will need to analyze the description and provide and propose a new software architecture with enhancements.\n",
122+
" the new architecture should be more efficient, secure, and scalable.\n",
123+
" the new architecture should include the following components:\n",
124+
" 1) IMPORTANT: only provide the FLOW of new Architecture components from start to end.\n",
125+
" 2) IMPORTANT: flow should be concise and to the point. as a graph with description of each node and connection.\n",
126+
" 3) exit once image is generated.\n",
127+
" \"\"\").strip(),\n",
128+
" max_consecutive_auto_reply=1,\n",
129+
" )\n",
130+
"\n",
131+
" user_agent = UserProxyAgent(\n",
132+
" name=\"user\",\n",
133+
" human_input_mode=\"ALWAYS\",\n",
134+
" )\n",
135+
"\n",
136+
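"# Design agent: renders architecture flows as images via the image_generation built-in tool\n",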
"design_agent = AssistantAgent(\n",
137+
" name=\"design_agent\",\n",
138+
" llm_config=llm_config,\n",
139+
" system_message=textwrap.dedent(\"\"\"\n",
140+
" generate images for software architecture.\n",
141+
" the image should be a flow of the software architecture.\n",
142+
" the image should be in a format that can be used to generate a software architecture.\n",
143+
" # if solution architect returns a new software architecture flow, you should generate an image for the new software architecture flow.\n",
144+
" \"\"\").strip(),\n",
145+
" max_consecutive_auto_reply=1,\n",
146+
")"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": null,
152+
"id": "8",
153+
"metadata": {},
154+
"outputs": [],
155+
"source": [
156+
"# ----helper function to save image from base64 string----\n",
157+
"def save_b64_png(b64_str, fname=\"generated.png\"):\n",
158+
" with open(fname, \"wb\") as f:\n",
159+
" f.write(base64.b64decode(b64_str))\n",
160+
" print(f\"image saved → {fname}\")\n",
161+
"\n",
162+
"\n",
163+
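"# Walk the chat result and save any base64 images returned by the design agent's image_generation tool calls\n",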
"def save_artbot_images_from_response(response):\n",
164+
" messages = response.messages\n",
165+
" for i in range(len(messages)):\n",
166+
" print(i)\n",
167+
" message = messages[i]\n",
168+
" if message.get(\"name\") == \"design_agent\":\n",
169+
" contents = message.get(\"content\", [])\n",
170+
" for content in contents:\n",
171+
" if (\n",
172+
" content.get(\"type\") == \"tool_call\"\n",
173+
" and content.get(\"name\") == \"image_generation\"\n",
174+
" and \"content\" in content\n",
175+
" and content[\"content\"]\n",
176+
" ):\n",
177+
" print(\"Saving image!\")\n",
178+
" save_b64_png(content[\"content\"], f\"image{i}.png\")"
179+
]
180+
},
181+
{
182+
"cell_type": "markdown",
183+
"id": "9",
184+
"metadata": {},
185+
"source": [
186+
"### Define tools for agent and tool description\n",
187+
"1. To Get Image Descriptions\n",
188+
"2. To Generate Image"
189+
]
190+
},
191+
{
192+
"cell_type": "code",
193+
"execution_count": null,
194+
"id": "10",
195+
"metadata": {},
196+
"outputs": [],
197+
"source": [
198+
"decription_tool_prompt = \"\"\"\n",
199+
"This tool is used to get the description of the architecture image.\n",
200+
"Input Args:\n",
201+
"- image_url: str (url of the architecture image)\n",
202+
"\"\"\"\n",
203+
"\n",
204+
"\n",
205+
"@analyst.register_for_llm(description=decription_tool_prompt)\n",
206+
"@user_agent.register_for_execution(description=decription_tool_prompt)\n",
207+
"async def get_image_description(image_url: str):\n",
208+
" prompt = f\"\"\"\n",
209+
" Given the following architecture image: {image_url}\n",
210+
" Return a short and concise description of the image.\n",
211+
" Then, provide the flow of the architecture in clear, numbered or bulleted points.\n",
212+
" Format:\n",
213+
" Description: <one paragraph understanding the architecture>\n",
214+
" Flow:\n",
215+
" 1. <first step/component>(description)\n",
216+
" 2. <second step/component>(description)\n",
217+
" ...\n",
218+
" Only include the essential components and their order in the flow.\n",
219+
" \"\"\"\n",
220+
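" # Build a multimodal message: the text prompt plus the image URL as an image_url content part\n",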
" chat = {\n",
221+
" \"role\": \"user\",\n",
222+
" \"content\": [\n",
223+
" {\n",
224+
" \"type\": \"input_text\",\n",
225+
" \"text\": textwrap.dedent(f\"\"\"\n",
226+
" {prompt}\n",
227+
" \"\"\").strip(),\n",
228+
" },\n",
229+
" {\"type\": \"image_url\", \"image_url\": {\"url\": image_url, \"detail\": \"high\"}},\n",
230+
" ],\n",
231+
" }\n",
232+
" design_agent.run(message=chat, user_input=False, max_rounds=1).process()\n",
233+
" last_message = design_agent.last_message()\n",
234+
" return last_message[\"content\"]\n",
235+
"\n",
236+
"\n",
237+
"tool_prompt = \"\"\"\n",
238+
"This tool is used to generate an architecture flowchart image for the provided software architecture flow.\n",
239+
"Input Args:\n",
240+
"- architecture_flow: str (detail flow of the software architecture in numbered or bulleted points)\n",
241+
"\"\"\"\n",
242+
"\n",
243+
"\n",
244+
"@solution_architect.register_for_llm(description=tool_prompt)\n",
245+
"@user_agent.register_for_execution(description=tool_prompt)\n",
246+
"async def design_architecture(architecture_flow: str):\n",
247+
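" # Ask the design agent to render the proposed flow as a flowchart image, then save any generated images\n",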
" response = design_agent.run(\n",
248+
" message=f\"generate an architecture flowchart image for the following software architecture flow: {architecture_flow}\",\n",
249+
" chat_history=True,\n",
250+
" user_input=False,\n",
251+
" max_turns=1,\n",
252+
" ).process()\n",
253+
"\n",
254+
" last_message = design_agent.last_message()\n",
255+
" save_artbot_images_from_response(response)\n",
256+
" return last_message[\"content\"][-1]"
257+
]
258+
},
259+
{
260+
"cell_type": "markdown",
261+
"id": "11",
262+
"metadata": {},
263+
"source": [
264+
"### DefaultPattern utilizing an LLM-based handoff condition"
265+
]
266+
},
267+
{
268+
"cell_type": "code",
269+
"execution_count": null,
270+
"id": "12",
271+
"metadata": {},
272+
"outputs": [],
273+
"source": [
274+
"default_pattern = DefaultPattern(\n",
275+
" initial_agent=analyst,\n",
276+
" agents=[analyst, solution_architect],\n",
277+
" user_agent=user_agent,\n",
278+
" group_manager_args={\"llm_config\": llm_config},\n",
279+
")\n",
280+
"\n",
281+
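"# Hand off from the analyst to the solution architect once an image description/analysis has been produced\n",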
"analyst.handoffs.add_llm_conditions([\n",
282+
" OnCondition(\n",
283+
" target=AgentNameTarget(\"solution_architect\"),\n",
284+
" condition=StringLLMCondition(prompt=\"When Analyst agent returns Description/Analysis of an Architecture Image\"),\n",
285+
" ),\n",
286+
"])"
287+
]
288+
},
289+
{
290+
"cell_type": "code",
291+
"execution_count": null,
292+
"id": "13",
293+
"metadata": {},
294+
"outputs": [],
295+
"source": [
296+
"IMAGE_URL = \"https://user-images.githubusercontent.com/65826354/179526761-7f473e3d-f71c-429d-bf49-16958c5cb7a6.png\"\n",
297+
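"# Start the group chat: the analyst describes the image, then hands off to the solution architect for enhancements\n",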
"default_paresult, context, last_agent = initiate_group_chat(\n",
298+
" pattern=default_pattern,\n",
299+
" messages=f\"Describe this image {IMAGE_URL} provide a detailed analysis of the software architecture.\",\n",
300+
" max_rounds=20,\n",
301+
")"
302+
]
303+
}
304+
],
305+
"metadata": {
306+
"front_matter": {
307+
"description": "Using MathChat to Solve Math Problems",
308+
"tags": [
309+
"grok"
310+
]
311+
},
312+
"kernelspec": {
313+
"display_name": "Python 3",
314+
"language": "python",
315+
"name": "python3"
316+
},
317+
"language_info": {
318+
"codemirror_mode": {
319+
"name": "ipython",
320+
"version": 3
321+
},
322+
"file_extension": ".py",
323+
"mimetype": "text/x-python",
324+
"name": "python",
325+
"nbconvert_exporter": "python",
326+
"pygments_lexer": "ipython3",
327+
"version": "3.13.5"
328+
}
329+
},
330+
"nbformat": 4,
331+
"nbformat_minor": 5
332+
}

test/test_notebook.py

Lines changed: 11 additions & 1 deletion
@@ -150,10 +150,20 @@ def test_agentchat_groupchat_stateflow(save=False):
150150
reason="do not run if py!=3.13",
151151
)
152152
@run_for_optional_imports(["openai"], "openai")
153-
def test_agentchat_grok_sbom_analysisw(save=False):
153+
def test_agentchat_grok_sbom_analysis(save=False):
154154
run_notebook("agentchat_grok_sbom_analysis.ipynb", save=save)
155155

156156

157+
@run_for_optional_imports("openai", "openai")
158+
@pytest.mark.skipif(
159+
not sys.version.startswith("3.13"),
160+
reason="do not run if py!=3.13",
161+
)
162+
@run_for_optional_imports(["openai"], "openai")
163+
def test_agentchat_grok_usecase_image_inputs(save=False):
164+
run_notebook("agentchat_grok_usecase_image_inputs.ipynb", save=save)
165+
166+
157167
if __name__ == "__main__":
158168
# test_agentchat_auto_feedback_from_code(save=True)
159169
# test_oai_chatgpt_gpt4(save=True)
