Skip to content

Commit 2df5cef

Browse files
committed
feat: add comprehensive HTML DOM extraction documentation for execute_browser_script
- Added Browser DOM Inspection section to MCP agent README - Enhanced execute_browser_script tool description with DOM extraction details - Added DOM retrieval examples to prompt.rs for AI agents - Created browser_dom_extraction.yml with 8 comprehensive extraction patterns - Enhanced test_remote_workflow_with_browser_script.yml with DOM examples - Documented size limits and truncation strategies for large DOMs - Clarified when to use DOM vs Accessibility Tree approaches
1 parent d496d63 commit 2df5cef

File tree

5 files changed

+542
-5
lines changed

5 files changed

+542
-5
lines changed
Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,318 @@
1+
---
2+
# Browser DOM Extraction Examples
3+
# This workflow demonstrates various patterns for extracting HTML DOM data using execute_browser_script
4+
tool_name: execute_sequence
5+
arguments:
6+
variables:
7+
test_url:
8+
type: string
9+
label: "URL to extract DOM from"
10+
default: "https://example.com"
11+
12+
inputs:
13+
test_url: "https://example.com"
14+
15+
steps:
16+
# Step 1: Navigate to the target page
17+
- tool_name: navigate_browser
18+
arguments:
19+
url: "${{inputs.test_url}}"
20+
browser: "chrome"
21+
delay_ms: 3000
22+
continue_on_error: false
23+
24+
# Step 2: Get full HTML DOM (simple extraction)
25+
- tool_name: execute_browser_script
26+
arguments:
27+
selector: "role:Window|name:Chrome"
28+
script: |
29+
// Simple full DOM extraction
30+
document.documentElement.outerHTML
31+
delay_ms: 500
32+
continue_on_error: true
33+
step_id: full_dom_extraction
34+
35+
# Step 3: Get structured page information with size management
36+
- tool_name: execute_browser_script
37+
arguments:
38+
selector: "role:Window|name:Chrome"
39+
script: |
40+
// Structured extraction with truncation for large DOMs
41+
const html = document.documentElement.outerHTML;
42+
const maxLength = 30000; // MCP response size limit
43+
44+
({
45+
// Basic page info
46+
url: window.location.href,
47+
title: document.title,
48+
description: document.querySelector('meta[name="description"]')?.content || '',
49+
50+
// HTML with size management
51+
html: html.length > maxLength
52+
? html.substring(0, maxLength) + '... [truncated]'
53+
: html,
54+
htmlLength: html.length,
55+
wasTruncated: html.length > maxLength,
56+
57+
// Text content preview
58+
bodyText: document.body.innerText.substring(0, 500),
59+
60+
// Page metrics
61+
timestamp: new Date().toISOString()
62+
})
63+
delay_ms: 500
64+
continue_on_error: false
65+
step_id: structured_page_info
66+
67+
# Step 4: Extract forms and input fields
68+
- tool_name: execute_browser_script
69+
arguments:
70+
selector: "role:Window|name:Chrome"
71+
script: |
72+
// Extract all forms and their inputs
73+
({
74+
formCount: document.forms.length,
75+
forms: Array.from(document.forms).map(form => ({
76+
id: form.id || null,
77+
name: form.name || null,
78+
action: form.action,
79+
method: form.method.toUpperCase(),
80+
target: form.target || '_self',
81+
82+
// Extract all form inputs
83+
inputs: Array.from(form.elements).map(element => ({
84+
tagName: element.tagName.toLowerCase(),
85+
type: element.type || null,
86+
name: element.name || null,
87+
id: element.id || null,
88+
required: element.required || false,
89+
disabled: element.disabled || false,
90+
// Redact sensitive values
91+
value: element.type === 'password' ? '[REDACTED]' :
92+
(element.value ? element.value.substring(0, 100) : ''),
93+
placeholder: element.placeholder || null
94+
}))
95+
})),
96+
97+
// Also get hidden inputs separately (useful for CSRF tokens, etc)
98+
hiddenInputs: Array.from(document.querySelectorAll('input[type="hidden"]')).map(input => ({
99+
name: input.name,
100+
value: input.value.substring(0, 100), // Truncate long values
101+
form: input.form?.id || null
102+
}))
103+
})
104+
delay_ms: 500
105+
continue_on_error: false
106+
step_id: form_extraction
107+
108+
# Step 5: Extract metadata and SEO information
109+
- tool_name: execute_browser_script
110+
arguments:
111+
selector: "role:Window|name:Chrome"
112+
script: |
113+
// Extract meta tags, Open Graph, and structured data
114+
({
115+
// Standard meta tags
116+
metaTags: Array.from(document.querySelectorAll('meta')).map(meta => ({
117+
name: meta.name || null,
118+
property: meta.getAttribute('property') || null,
119+
content: (meta.content || '').substring(0, 200),
120+
httpEquiv: meta.httpEquiv || null
121+
})).filter(m => m.name || m.property),
122+
123+
// Open Graph tags specifically
124+
openGraph: Array.from(document.querySelectorAll('meta[property^="og:"]')).reduce((acc, meta) => {
125+
const property = meta.getAttribute('property').replace('og:', '');
126+
acc[property] = meta.content;
127+
return acc;
128+
}, {}),
129+
130+
// JSON-LD structured data
131+
jsonLd: Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
132+
.map(script => {
133+
try {
134+
return JSON.parse(script.textContent);
135+
} catch (e) {
136+
return { error: 'Failed to parse JSON-LD', content: script.textContent.substring(0, 100) };
137+
}
138+
}),
139+
140+
// Page title and canonical URL
141+
pageTitle: document.title,
142+
canonical: document.querySelector('link[rel="canonical"]')?.href || null,
143+
144+
// Language and charset
145+
language: document.documentElement.lang || null,
146+
charset: document.characterSet
147+
})
148+
delay_ms: 500
149+
continue_on_error: false
150+
step_id: metadata_extraction
151+
152+
# Step 6: Analyze page structure and content
153+
- tool_name: execute_browser_script
154+
arguments:
155+
selector: "role:Window|name:Chrome"
156+
script: |
157+
// Analyze page structure for content extraction
158+
({
159+
// Document statistics
160+
statistics: {
161+
totalElements: document.querySelectorAll('*').length,
162+
forms: document.forms.length,
163+
links: document.links.length,
164+
images: document.images.length,
165+
scripts: document.scripts.length,
166+
stylesheets: document.styleSheets.length
167+
},
168+
169+
// Heading structure (useful for content hierarchy)
170+
headings: Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6')).map(h => ({
171+
level: parseInt(h.tagName.substring(1)),
172+
text: h.innerText.substring(0, 100),
173+
id: h.id || null,
174+
className: h.className || null
175+
})),
176+
177+
// Links analysis
178+
links: Array.from(document.links).slice(0, 50).map(link => ({
179+
text: link.innerText.substring(0, 50),
180+
href: link.href,
181+
target: link.target || '_self',
182+
rel: link.rel || null,
183+
isExternal: link.hostname !== window.location.hostname
184+
})),
185+
186+
// Images with alt text (accessibility check)
187+
images: Array.from(document.images).slice(0, 20).map(img => ({
188+
src: img.src,
189+
alt: img.alt || '[no alt text]',
190+
width: img.naturalWidth,
191+
height: img.naturalHeight,
192+
loading: img.loading || 'auto'
193+
}))
194+
})
195+
delay_ms: 500
196+
continue_on_error: false
197+
step_id: structure_analysis
198+
199+
# Step 7: Extract clean text content without HTML
200+
- tool_name: execute_browser_script
201+
arguments:
202+
selector: "role:Window|name:Chrome"
203+
script: |
204+
// Get clean, readable text content
205+
// Clone document and remove unwanted elements
206+
const clonedDoc = document.documentElement.cloneNode(true);
207+
208+
// Remove script, style, and other non-content elements
209+
const elementsToRemove = clonedDoc.querySelectorAll(
210+
'script, style, noscript, iframe, object, embed, [hidden], .hidden'
211+
);
212+
elementsToRemove.forEach(el => el.remove());
213+
214+
// Get text content
215+
const cleanText = clonedDoc.innerText || clonedDoc.textContent || '';
216+
217+
({
218+
// Main content text
219+
cleanText: cleanText.substring(0, 5000),
220+
textLength: cleanText.length,
221+
222+
// Extract specific content areas if they exist
223+
mainContent: (() => {
224+
const main = document.querySelector('main, [role="main"], article, .content, #content');
225+
return main ? main.innerText.substring(0, 2000) : null;
226+
})(),
227+
228+
// Navigation text
229+
navigation: (() => {
230+
const nav = document.querySelector('nav, [role="navigation"]');
231+
return nav ? nav.innerText.substring(0, 500) : null;
232+
})(),
233+
234+
// Footer content
235+
footer: (() => {
236+
const footer = document.querySelector('footer, [role="contentinfo"]');
237+
return footer ? footer.innerText.substring(0, 500) : null;
238+
})(),
239+
240+
// Word count estimate
241+
wordCount: cleanText.split(/\s+/).filter(word => word.length > 0).length
242+
})
243+
delay_ms: 500
244+
continue_on_error: false
245+
step_id: clean_text_extraction
246+
247+
# Step 8: Check for specific patterns or elements
248+
- tool_name: execute_browser_script
249+
arguments:
250+
selector: "role:Window|name:Chrome"
251+
script: |
252+
// Check for specific patterns and elements
253+
({
254+
// Check for common frameworks/libraries
255+
frameworks: {
256+
jquery: typeof jQuery !== 'undefined' || typeof $ !== 'undefined',
257+
react: document.querySelector('[data-reactroot], [data-react-root], #root') !== null,
258+
angular: document.querySelector('[ng-app], [data-ng-app], .ng-scope') !== null,
259+
// Vue has no fixed class/attribute to select on: look for instance handles (__vue__ in Vue 2, __vue_app__ in Vue 3) or any data-v-* attribute (scoped styles, data-v-app)
vue: typeof Vue !== 'undefined' || Array.from(document.querySelectorAll('*')).some(el => '__vue__' in el || '__vue_app__' in el || el.getAttributeNames().some(attr => attr.startsWith('data-v-')))
260+
},
261+
262+
// Check for authentication/user elements
263+
authentication: {
264+
loginForm: document.querySelector('form[action*="login"], form[action*="signin"], #loginForm') !== null,
265+
logoutLink: document.querySelector('a[href*="logout"], a[href*="signout"]') !== null,
266+
userMenu: document.querySelector('[class*="user-menu"], [class*="account"], [id*="user-menu"]') !== null
267+
},
268+
269+
// Check for e-commerce elements
270+
ecommerce: {
271+
addToCart: document.querySelector('[class*="add-to-cart"], [id*="add-to-cart"], button[data-action="add-to-cart"]') !== null,
272+
shoppingCart: document.querySelector('[class*="shopping-cart"], [class*="cart"], [id*="cart"]') !== null,
273+
productPrice: document.querySelector('[class*="price"], [itemprop="price"], .price, .cost') !== null,
274+
checkoutButton: document.querySelector('[href*="checkout"], button[class*="checkout"]') !== null
275+
},
276+
277+
// Check for media elements
278+
media: {
279+
videos: document.querySelectorAll('video').length,
280+
audios: document.querySelectorAll('audio').length,
281+
iframes: document.querySelectorAll('iframe').length,
282+
youtubeEmbeds: document.querySelectorAll('iframe[src*="youtube.com"], iframe[src*="youtu.be"]').length
283+
},
284+
285+
// Check for tracking/analytics
286+
analytics: {
287+
googleAnalytics: typeof ga !== 'undefined' || typeof gtag !== 'undefined',
288+
googleTagManager: document.querySelector('script[src*="googletagmanager.com"]') !== null,
289+
facebookPixel: typeof fbq !== 'undefined'
290+
}
291+
})
292+
delay_ms: 500
293+
continue_on_error: false
294+
step_id: pattern_detection
295+
296+
stop_on_error: false
297+
include_detailed_results: true
298+
299+
# Output parser to summarize all extracted data
300+
output_parser:
301+
ui_tree_source_step_id: pattern_detection
302+
javascript_code: |
303+
// This parser would normally process the UI tree, but for DOM extraction
304+
// we're primarily interested in the browser script results
305+
// Return a summary of what was extracted
// NOTE(review): every field below is a hard-coded literal; nothing here reads the
// step results, so the returned summary does not vary with step outcomes.
// Confirm that is intended, or derive these values from the actual results.
306+
return {
307+
summary: "DOM extraction completed successfully",
308+
stepsCompleted: 8,
309+
extractedData: [
310+
"Full HTML DOM",
311+
"Structured page information",
312+
"Forms and inputs",
313+
"Metadata and SEO tags",
314+
"Page structure analysis",
315+
"Clean text content",
316+
"Framework and pattern detection"
317+
]
318+
};

examples/test_remote_workflow_with_browser_script.yml

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,58 @@ arguments:
1515
inputs:
1616
test_url: "https://httpbin.org/html"
1717
script_to_run: |
18-
// Test script that gets page info and creates a simple element
18+
// Enhanced script that demonstrates DOM extraction capabilities
1919
const pageInfo = {
2020
title: document.title,
2121
url: window.location.href,
2222
timestamp: new Date().toISOString()
2323
};
2424
25+
// Get HTML DOM with size management
26+
const html = document.documentElement.outerHTML;
27+
const maxLength = 10000; // Limit for this test
28+
29+
// Extract page structure
30+
const domInfo = {
31+
...pageInfo,
32+
htmlSnippet: html.substring(0, maxLength),
33+
htmlTotalLength: html.length,
34+
35+
// Extract specific elements
36+
headings: Array.from(document.querySelectorAll('h1,h2,h3')).map(h => ({
37+
level: h.tagName,
38+
text: h.innerText
39+
})),
40+
41+
// Count various elements
42+
elementCounts: {
43+
total: document.querySelectorAll('*').length,
44+
links: document.links.length,
45+
images: document.images.length,
46+
forms: document.forms.length
47+
},
48+
49+
// Get meta information
50+
meta: {
51+
description: document.querySelector('meta[name="description"]')?.content || null,
52+
charset: document.characterSet,
53+
language: document.documentElement.lang || null
54+
}
55+
};
56+
2557
// Create a test div to show we can modify the page
2658
const testDiv = document.createElement('div');
2759
testDiv.id = 'terminator-test-element';
28-
testDiv.style.cssText = 'position:fixed;top:10px;right:10px;background:red;color:white;padding:10px;z-index:9999;';
29-
testDiv.textContent = 'Terminator Test: ' + pageInfo.title;
60+
testDiv.style.cssText = 'position:fixed;top:10px;right:10px;background:green;color:white;padding:15px;z-index:9999;border-radius:5px;';
61+
testDiv.innerHTML = `
62+
<strong>Terminator DOM Test</strong><br>
63+
Title: ${pageInfo.title}<br>
64+
Elements: ${domInfo.elementCounts.total}<br>
65+
HTML Size: ${domInfo.htmlTotalLength} bytes
66+
`;
3067
document.body.appendChild(testDiv);
3168
32-
return JSON.stringify(pageInfo);
69+
return JSON.stringify(domInfo, null, 2);
3370
3471
steps:
3572
# Step 1: Install Chrome extension using remote workflow

0 commit comments

Comments
 (0)