---
# Browser DOM Extraction Examples
#
# This workflow demonstrates various patterns for extracting HTML DOM data
# with execute_browser_script. It is a single execute_sequence call:
#   1. navigate to the target page
#   2. dump the full outerHTML
#   3. structured page info with HTML truncation
#   4. forms and their controls
#   5. meta / Open Graph / JSON-LD metadata
#   6. page structure (headings, links, images)
#   7. clean text content
#   8. framework / feature pattern detection
#
# Each script's value is its final expression. Scripts that need local
# variables are wrapped in an IIFE so their const bindings stay out of the
# page's global scope.
#
# NOTE(review): delay_ms / continue_on_error / step_id are placed at step
# level and stop_on_error / include_detailed_results / output_parser under the
# top-level arguments - confirm against the execute_sequence schema.
tool_name: execute_sequence
arguments:
  variables:
    test_url:
      type: string
      label: "URL to extract DOM from"
      default: "https://example.com"

  inputs:
    test_url: "https://example.com"

  steps:
    # Step 1: Navigate to the target page
    - tool_name: navigate_browser
      arguments:
        url: "${{inputs.test_url}}"
        browser: "chrome"
      delay_ms: 3000
      continue_on_error: false

    # Step 2: Get full HTML DOM (simple extraction)
    # The result is unbounded in size; this is the only step that sets
    # continue_on_error to true.
    - tool_name: execute_browser_script
      arguments:
        selector: "role:Window|name:Chrome"
        script: |
          // Simple full DOM extraction
          document.documentElement.outerHTML
      delay_ms: 500
      continue_on_error: true
      step_id: full_dom_extraction

    # Step 3: Get structured page information with size management
    - tool_name: execute_browser_script
      arguments:
        selector: "role:Window|name:Chrome"
        script: |
          // Structured extraction with truncation for large DOMs
          (() => {
            const html = document.documentElement.outerHTML;
            const maxLength = 30000; // MCP response size limit
            const wasTruncated = html.length > maxLength;

            return {
              // Basic page info
              url: window.location.href,
              title: document.title,
              description: document.querySelector('meta[name="description"]')?.content || '',

              // HTML with size management
              html: wasTruncated
                ? html.substring(0, maxLength) + '... [truncated]'
                : html,
              htmlLength: html.length,
              wasTruncated: wasTruncated,

              // Text content preview (document.body is null on documents
              // without a <body>, so guard before reading innerText)
              bodyText: (document.body?.innerText || '').substring(0, 500),

              // Extraction time
              timestamp: new Date().toISOString()
            };
          })()
      delay_ms: 500
      continue_on_error: false
      step_id: structured_page_info

    # Step 4: Extract forms and input fields
    - tool_name: execute_browser_script
      arguments:
        selector: "role:Window|name:Chrome"
        script: |
          // Extract all forms and their inputs
          (() => {
            // A control named "id", "name", "action", "method", "target" or
            // "elements" shadows the same-named property on its <form>
            // (DOM clobbering), so plain form.action etc. may return an
            // element instead of a string. Read through the prototype getter.
            const formProp = (form, prop) => {
              let proto = Object.getPrototypeOf(form);
              while (proto) {
                const desc = Object.getOwnPropertyDescriptor(proto, prop);
                if (desc && desc.get) return desc.get.call(form);
                proto = Object.getPrototypeOf(proto);
              }
              return undefined;
            };

            return {
              formCount: document.forms.length,
              forms: Array.from(document.forms).map(form => ({
                id: formProp(form, 'id') || null,
                name: formProp(form, 'name') || null,
                action: formProp(form, 'action'),
                method: formProp(form, 'method').toUpperCase(),
                target: formProp(form, 'target') || '_self',

                // Extract all form inputs
                inputs: Array.from(formProp(form, 'elements')).map(element => ({
                  tagName: element.tagName.toLowerCase(),
                  type: element.type || null,
                  name: element.name || null,
                  id: element.id || null,
                  required: element.required || false,
                  disabled: element.disabled || false,
                  // Redact sensitive values; truncate everything else
                  value: element.type === 'password'
                    ? '[REDACTED]'
                    : (element.value ? element.value.substring(0, 100) : ''),
                  placeholder: element.placeholder || null
                }))
              })),

              // Also get hidden inputs separately (useful for CSRF tokens, etc)
              hiddenInputs: Array.from(document.querySelectorAll('input[type="hidden"]')).map(input => ({
                name: input.name,
                value: input.value.substring(0, 100), // Truncate long values
                form: input.form ? (formProp(input.form, 'id') || null) : null
              }))
            };
          })()
      delay_ms: 500
      continue_on_error: false
      step_id: form_extraction

    # Step 5: Extract metadata and SEO information
    - tool_name: execute_browser_script
      arguments:
        selector: "role:Window|name:Chrome"
        script: |
          // Extract meta tags, Open Graph, and structured data
          ({
            // Standard meta tags (entries with neither name nor property are dropped)
            metaTags: Array.from(document.querySelectorAll('meta')).map(meta => ({
              name: meta.name || null,
              property: meta.getAttribute('property') || null,
              content: (meta.content || '').substring(0, 200),
              httpEquiv: meta.httpEquiv || null
            })).filter(m => m.name || m.property),

            // Open Graph tags specifically, keyed by the part after "og:"
            openGraph: Array.from(document.querySelectorAll('meta[property^="og:"]')).reduce((acc, meta) => {
              const property = meta.getAttribute('property').replace('og:', '');
              acc[property] = meta.content;
              return acc;
            }, {}),

            // JSON-LD structured data; unparsable blocks become an error stub
            jsonLd: Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
              .map(script => {
                try {
                  return JSON.parse(script.textContent);
                } catch (e) {
                  return { error: 'Failed to parse JSON-LD', content: script.textContent.substring(0, 100) };
                }
              }),

            // Page title and canonical URL
            pageTitle: document.title,
            canonical: document.querySelector('link[rel="canonical"]')?.href || null,

            // Language and charset
            language: document.documentElement.lang || null,
            charset: document.characterSet
          })
      delay_ms: 500
      continue_on_error: false
      step_id: metadata_extraction

    # Step 6: Analyze page structure and content
    - tool_name: execute_browser_script
      arguments:
        selector: "role:Window|name:Chrome"
        script: |
          // Analyze page structure for content extraction
          ({
            // Document statistics
            statistics: {
              totalElements: document.querySelectorAll('*').length,
              forms: document.forms.length,
              links: document.links.length,
              images: document.images.length,
              scripts: document.scripts.length,
              stylesheets: document.styleSheets.length
            },

            // Heading structure (useful for content hierarchy)
            headings: Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6')).map(h => ({
              level: parseInt(h.tagName.substring(1)),
              text: h.innerText.substring(0, 100),
              id: h.id || null,
              className: h.className || null
            })),

            // Links analysis (first 50 only)
            links: Array.from(document.links).slice(0, 50).map(link => ({
              text: link.innerText.substring(0, 50),
              href: link.href,
              target: link.target || '_self',
              rel: link.rel || null,
              // mailto:, tel: and javascript: links have an empty hostname;
              // do not report those as pointing to another site
              isExternal: link.hostname !== '' &&
                link.hostname !== window.location.hostname
            })),

            // Images with alt text (accessibility check, first 20 only)
            images: Array.from(document.images).slice(0, 20).map(img => ({
              src: img.src,
              alt: img.alt || '[no alt text]',
              width: img.naturalWidth,
              height: img.naturalHeight,
              loading: img.loading || 'auto'
            }))
          })
      delay_ms: 500
      continue_on_error: false
      step_id: structure_analysis

    # Step 7: Extract clean text content without HTML
    - tool_name: execute_browser_script
      arguments:
        selector: "role:Window|name:Chrome"
        script: |
          // Get clean, readable text content
          (() => {
            // Work on a clone so the live page is not modified
            const clonedDoc = document.documentElement.cloneNode(true);

            // Remove script, style, and other non-content elements
            clonedDoc
              .querySelectorAll('script, style, noscript, iframe, object, embed, [hidden], .hidden')
              .forEach(el => el.remove());

            // Get text content
            const cleanText = clonedDoc.innerText || clonedDoc.textContent || '';

            // First match of `selector` in the live document, as text capped at `limit`
            const sectionText = (selector, limit) => {
              const node = document.querySelector(selector);
              return node ? node.innerText.substring(0, limit) : null;
            };

            return {
              // Main content text
              cleanText: cleanText.substring(0, 5000),
              textLength: cleanText.length,

              // Extract specific content areas if they exist
              mainContent: sectionText('main, [role="main"], article, .content, #content', 2000),

              // Navigation text
              navigation: sectionText('nav, [role="navigation"]', 500),

              // Footer content
              footer: sectionText('footer, [role="contentinfo"]', 500),

              // Word count estimate
              wordCount: cleanText.split(/\s+/).filter(word => word.length > 0).length
            };
          })()
      delay_ms: 500
      continue_on_error: false
      step_id: clean_text_extraction

    # Step 8: Check for specific patterns or elements
    - tool_name: execute_browser_script
      arguments:
        selector: "role:Window|name:Chrome"
        script: |
          // Check for specific patterns and elements (heuristics only)
          ({
            // Check for common frameworks/libraries
            frameworks: {
              // Many libraries define a global `$`; only count it when it
              // carries jQuery's `$.fn.jquery` version marker
              jquery: typeof jQuery !== 'undefined' ||
                (typeof $ === 'function' && !!$.fn && !!$.fn.jquery),
              react: document.querySelector('[data-reactroot], [data-react-root], #root') !== null,
              angular: document.querySelector('[ng-app], [data-ng-app], .ng-scope') !== null,
              // Vue 2 sets `__vue__` and Vue 3 sets `__vue_app__` as element
              // properties (not classes), and scoped styles add
              // `data-v-<hash>` attributes, which CSS cannot match by name
              // prefix - so inspect the elements directly
              vue: Array.from(document.querySelectorAll('*')).some(el =>
                '__vue__' in el || '__vue_app__' in el ||
                el.getAttributeNames().some(attr => attr.startsWith('data-v-')))
            },

            // Check for authentication/user elements
            authentication: {
              loginForm: document.querySelector(
                'form[action*="login"], form[action*="signin"], #loginForm'
              ) !== null,
              logoutLink: document.querySelector('a[href*="logout"], a[href*="signout"]') !== null,
              userMenu: document.querySelector(
                '[class*="user-menu"], [class*="account"], [id*="user-menu"]'
              ) !== null
            },

            // Check for e-commerce elements
            ecommerce: {
              addToCart: document.querySelector(
                '[class*="add-to-cart"], [id*="add-to-cart"], button[data-action="add-to-cart"]'
              ) !== null,
              shoppingCart: document.querySelector(
                '[class*="shopping-cart"], [class*="cart"], [id*="cart"]'
              ) !== null,
              productPrice: document.querySelector(
                '[class*="price"], [itemprop="price"], .price, .cost'
              ) !== null,
              checkoutButton: document.querySelector(
                '[href*="checkout"], button[class*="checkout"]'
              ) !== null
            },

            // Check for media elements
            media: {
              videos: document.querySelectorAll('video').length,
              audios: document.querySelectorAll('audio').length,
              iframes: document.querySelectorAll('iframe').length,
              youtubeEmbeds: document.querySelectorAll(
                'iframe[src*="youtube.com"], iframe[src*="youtu.be"]'
              ).length
            },

            // Check for tracking/analytics
            analytics: {
              googleAnalytics: typeof ga !== 'undefined' || typeof gtag !== 'undefined',
              googleTagManager: document.querySelector('script[src*="googletagmanager.com"]') !== null,
              facebookPixel: typeof fbq !== 'undefined'
            }
          })
      delay_ms: 500
      continue_on_error: false
      step_id: pattern_detection

  stop_on_error: false
  include_detailed_results: true

  # Output parser to summarize all extracted data.
  # The returned object is a fixed literal: it does not read any step result,
  # so "summary" and "stepsCompleted" do not reflect whether the individual
  # steps actually succeeded.
  output_parser:
    ui_tree_source_step_id: pattern_detection
    javascript_code: |
      // This parser would normally process the UI tree, but for DOM extraction
      // we're primarily interested in the browser script results
      // Return a summary of what was extracted
      return {
        summary: "DOM extraction completed successfully",
        stepsCompleted: 8,
        extractedData: [
          "Full HTML DOM",
          "Structured page information",
          "Forms and inputs",
          "Metadata and SEO tags",
          "Page structure analysis",
          "Clean text content",
          "Framework and pattern detection"
        ]
      };