mediar-ai
diff --git a/examples/browser_dom_extraction.yml b/examples/browser_dom_extraction.yml
Lines changed: 318 additions & 0 deletions
diff --git a/examples/test_remote_workflow_with_browser_script.yml b/examples/test_remote_workflow_with_browser_script.yml
Lines changed: 41 additions & 4 deletions
@@ -0,0 +1,318 @@
+---
+# Browser DOM extraction examples.
+# Each step of this workflow shows one pattern for pulling HTML DOM data
+# out of a page with execute_browser_script.
+tool_name: execute_sequence
+arguments:
+  # Declares the one variable this workflow uses
+  variables:
+    test_url:
+      type: string
+      label: 'URL to extract DOM from'
+      default: 'https://example.com'
+
+  # Value given to test_url; step 1 reads it as ${{inputs.test_url}}
+  inputs:
+    test_url: 'https://example.com'
+
+  steps:
+    # Step 1: Open the target page in Chrome; this step is not allowed to fail
+    - tool_name: navigate_browser
+      continue_on_error: false
+      delay_ms: 3000
+      arguments:
+        browser: 'chrome'
+        url: "${{inputs.test_url}}"
+
+    # Step 2: Simplest extraction - the whole document as a single HTML string
+    - step_id: full_dom_extraction
+      tool_name: execute_browser_script
+      arguments:
+        selector: 'role:Window|name:Chrome'
+        script: |
+          // Evaluates to the serialized <html> element, with no size cap
+          document.documentElement.outerHTML
+      continue_on_error: true
+      delay_ms: 500
+
+    # Step 3: Structured page summary with the HTML capped in size
+    - step_id: structured_page_info
+      tool_name: execute_browser_script
+      arguments:
+        selector: 'role:Window|name:Chrome'
+        script: |
+          // Page summary; the HTML payload is cut off once it exceeds the cap
+          const HTML_CAP = 30000; // MCP response size limit
+          const fullHtml = document.documentElement.outerHTML;
+          const overCap = fullHtml.length > HTML_CAP;
+          const descriptionTag = document.querySelector('meta[name="description"]');
+
+          ({
+            // Where the script ran
+            url: window.location.href,
+            title: document.title,
+            description: descriptionTag?.content || '',
+
+            // Markup (shortened when over the cap) plus its untruncated length
+            html: overCap ? fullHtml.substring(0, HTML_CAP) + '... [truncated]' : fullHtml,
+            htmlLength: fullHtml.length,
+            wasTruncated: overCap,
+
+            // First 500 characters of the body's innerText
+            bodyText: document.body.innerText.substring(0, 500),
+
+            // Time the snapshot was taken
+            timestamp: new Date().toISOString()
+          })
+      continue_on_error: false
+      delay_ms: 500
+
+    # Step 4: Extract forms and input fields
+    - tool_name: execute_browser_script
+      arguments:
+        selector: "role:Window|name:Chrome"
+        script: |
+          // Extract all forms and their inputs.
+          //
+          // A form exposes its named controls as properties, and those shadow the
+          // built-in ones: with <input name="method"> inside it, form.method is the
+          // input element, so form.method.toUpperCase() throws and form.id / form.action
+          // return elements instead of strings. builtIn() calls the accessor found on
+          // the prototype chain, which always yields the real property value.
+          const builtIn = (el, key) => {
+            for (let proto = Object.getPrototypeOf(el); proto; proto = Object.getPrototypeOf(proto)) {
+              const accessor = Object.getOwnPropertyDescriptor(proto, key)?.get;
+              if (accessor) return accessor.call(el);
+            }
+            return undefined;
+          };
+
+          ({
+            formCount: document.forms.length,
+            forms: Array.from(document.forms).map(form => ({
+              id: builtIn(form, 'id') || null,
+              name: builtIn(form, 'name') || null,
+              action: builtIn(form, 'action'),
+              method: builtIn(form, 'method').toUpperCase(),
+              target: builtIn(form, 'target') || '_self',
+
+              // Extract all form controls (form.elements can be shadowed as well)
+              inputs: Array.from(builtIn(form, 'elements')).map(element => ({
+                tagName: element.tagName.toLowerCase(),
+                type: element.type || null,
+                name: element.name || null,
+                id: element.id || null,
+                required: element.required || false,
+                disabled: element.disabled || false,
+                // Redact password values; cap every other value at 100 characters
+                value: element.type === 'password' ? '[REDACTED]' :
+                       (element.value ? element.value.substring(0, 100) : ''),
+                placeholder: element.placeholder || null
+              }))
+            })),
+
+            // Also get hidden inputs separately (useful for CSRF tokens, etc)
+            hiddenInputs: Array.from(document.querySelectorAll('input[type="hidden"]')).map(input => ({
+              name: input.name,
+              value: input.value.substring(0, 100), // Truncate long values
+              // id of the owning form, or null when the input is outside any form
+              form: input.form ? (builtIn(input.form, 'id') || null) : null
+            }))
+          })
+      delay_ms: 500
+      continue_on_error: false
+      step_id: form_extraction
+
+    # Step 5: Extract metadata and SEO information
+    - tool_name: execute_browser_script
+      arguments:
+        selector: "role:Window|name:Chrome"
+        script: |
+          // Extract meta tags, Open Graph, and structured data
+          ({
+            // Standard meta tags. A tag is kept when it is identified by name,
+            // property or http-equiv; filtering on name/property alone would drop
+            // every pure http-equiv tag and leave the httpEquiv field unused.
+            metaTags: Array.from(document.querySelectorAll('meta')).map(meta => ({
+              name: meta.name || null,
+              property: meta.getAttribute('property') || null,
+              content: (meta.content || '').substring(0, 200),
+              httpEquiv: meta.httpEquiv || null
+            })).filter(m => m.name || m.property || m.httpEquiv),
+
+            // Open Graph tags specifically, keyed by the property name without "og:"
+            openGraph: Array.from(document.querySelectorAll('meta[property^="og:"]')).reduce((acc, meta) => {
+              const property = meta.getAttribute('property').replace('og:', '');
+              acc[property] = meta.content;
+              return acc;
+            }, {}),
+
+            // JSON-LD structured data; a block that fails to parse is reported
+            // as an error entry with the first 100 characters of its text
+            jsonLd: Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
+              .map(script => {
+                try {
+                  return JSON.parse(script.textContent);
+                } catch (e) {
+                  return { error: 'Failed to parse JSON-LD', content: script.textContent.substring(0, 100) };
+                }
+              }),
+
+            // Page title and canonical URL
+            pageTitle: document.title,
+            canonical: document.querySelector('link[rel="canonical"]')?.href || null,
+
+            // Language and charset
+            language: document.documentElement.lang || null,
+            charset: document.characterSet
+          })
+      delay_ms: 500
+      continue_on_error: false
+      step_id: metadata_extraction
+
+    # Step 6: Summarize page structure - element counts, headings, links, images
+    - step_id: structure_analysis
+      tool_name: execute_browser_script
+      arguments:
+        selector: 'role:Window|name:Chrome'
+        script: |
+          // Structural overview of the page for content extraction
+          const headingNodes = Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6'));
+          const firstLinks = Array.from(document.links).slice(0, 50);   // at most 50 links
+          const firstImages = Array.from(document.images).slice(0, 20); // at most 20 images
+
+          ({
+            // Counts over the whole document
+            statistics: {
+              totalElements: document.querySelectorAll('*').length,
+              forms: document.forms.length,
+              links: document.links.length,
+              images: document.images.length,
+              scripts: document.scripts.length,
+              stylesheets: document.styleSheets.length
+            },
+
+            // Heading outline; level is the digit taken from the tag name (H2 -> 2)
+            headings: headingNodes.map(node => ({
+              level: parseInt(node.tagName.substring(1)),
+              text: node.innerText.substring(0, 100),
+              id: node.id || null,
+              className: node.className || null
+            })),
+
+            // Link details; a link counts as external when its hostname differs from the page's
+            links: firstLinks.map(anchor => ({
+              text: anchor.innerText.substring(0, 50),
+              href: anchor.href,
+              target: anchor.target || '_self',
+              rel: anchor.rel || null,
+              isExternal: anchor.hostname !== window.location.hostname
+            })),
+
+            // Image details; a missing alt text is made explicit (accessibility check)
+            images: firstImages.map(picture => ({
+              src: picture.src,
+              alt: picture.alt || '[no alt text]',
+              width: picture.naturalWidth,
+              height: picture.naturalHeight,
+              loading: picture.loading || 'auto'
+            }))
+          })
+      continue_on_error: false
+      delay_ms: 500
+
+    # Step 7: Readable text of the page with markup and non-content elements stripped
+    - step_id: clean_text_extraction
+      tool_name: execute_browser_script
+      arguments:
+        selector: 'role:Window|name:Chrome'
+        script: |
+          // Work on a deep copy of <html> so the live page is left untouched
+          const copy = document.documentElement.cloneNode(true);
+
+          // Drop scripts, styles, embedded content and hidden elements from the copy
+          for (const node of copy.querySelectorAll(
+            'script, style, noscript, iframe, object, embed, [hidden], .hidden'
+          )) {
+            node.remove();
+          }
+
+          // Text that remains in the copy
+          const cleanText = copy.innerText || copy.textContent || '';
+
+          // innerText of the first live element matching the selector, capped at
+          // `limit` characters; null when nothing matches
+          const sectionText = (selector, limit) => {
+            const match = document.querySelector(selector);
+            return match ? match.innerText.substring(0, limit) : null;
+          };
+
+          ({
+            // Whole-page text (first 5000 characters) and its full length
+            cleanText: cleanText.substring(0, 5000),
+            textLength: cleanText.length,
+
+            // Well-known content areas, when the page has them
+            mainContent: sectionText('main, [role="main"], article, .content, #content', 2000),
+            navigation: sectionText('nav, [role="navigation"]', 500),
+            footer: sectionText('footer, [role="contentinfo"]', 500),
+
+            // Estimate: number of non-empty whitespace-separated tokens
+            wordCount: cleanText.split(/\s+/).filter(Boolean).length
+          })
+      continue_on_error: false
+      delay_ms: 500
+
+    # Step 8: Check for specific patterns or elements
+    - tool_name: execute_browser_script
+      arguments:
+        selector: "role:Window|name:Chrome"
+        script: |
+          // Check for specific patterns and elements (all checks are heuristics)
+
+          // Vue marks elements with attributes named data-v-<suffix> (scoped styles,
+          // and data-v-app on a Vue 3 mount root). A CSS attribute selector cannot match
+          // on an attribute-name prefix, so the attribute names are scanned instead.
+          // __vue__ / __vue_app__ are JS properties on the mount element, not classes;
+          // NOTE(review): they are only visible if this script shares the page's JS world.
+          const appRoot = document.querySelector('#app');
+          const vueDetected =
+            Array.from(document.querySelectorAll('*')).some(el =>
+              el.getAttributeNames().some(attr => attr.startsWith('data-v-'))) ||
+            Boolean(appRoot?.__vue__ || appRoot?.__vue_app__);
+
+          ({
+            // Check for common frameworks/libraries
+            frameworks: {
+              jquery: typeof jQuery !== 'undefined' || typeof $ !== 'undefined',
+              react: document.querySelector('[data-reactroot], [data-react-root], #root') !== null,
+              angular: document.querySelector('[ng-app], [data-ng-app], .ng-scope') !== null,
+              vue: vueDetected
+            },
+
+            // Check for authentication/user elements
+            authentication: {
+              loginForm: document.querySelector('form[action*="login"], form[action*="signin"], #loginForm') !== null,
+              logoutLink: document.querySelector('a[href*="logout"], a[href*="signout"]') !== null,
+              userMenu: document.querySelector('[class*="user-menu"], [class*="account"], [id*="user-menu"]') !== null
+            },
+
+            // Check for e-commerce elements
+            ecommerce: {
+              addToCart: document.querySelector('[class*="add-to-cart"], [id*="add-to-cart"], button[data-action="add-to-cart"]') !== null,
+              shoppingCart: document.querySelector('[class*="shopping-cart"], [class*="cart"], [id*="cart"]') !== null,
+              productPrice: document.querySelector('[class*="price"], [itemprop="price"], .price, .cost') !== null,
+              checkoutButton: document.querySelector('[href*="checkout"], button[class*="checkout"]') !== null
+            },
+
+            // Count media elements
+            media: {
+              videos: document.querySelectorAll('video').length,
+              audios: document.querySelectorAll('audio').length,
+              iframes: document.querySelectorAll('iframe').length,
+              youtubeEmbeds: document.querySelectorAll('iframe[src*="youtube.com"], iframe[src*="youtu.be"]').length
+            },
+
+            // Check for tracking/analytics
+            analytics: {
+              googleAnalytics: typeof ga !== 'undefined' || typeof gtag !== 'undefined',
+              googleTagManager: document.querySelector('script[src*="googletagmanager.com"]') !== null,
+              facebookPixel: typeof fbq !== 'undefined'
+            }
+          })
+      delay_ms: 500
+      continue_on_error: false
+      step_id: pattern_detection
+
+  include_detailed_results: true
+  stop_on_error: false
+
+  # Output parser: returns a fixed summary of what the steps above extract
+  output_parser:
+    ui_tree_source_step_id: pattern_detection
+    javascript_code: |
+      // A parser would normally process the UI tree. For DOM extraction the
+      // interesting data is in the browser script results, so this parser only
+      // returns a static summary of what was extracted.
+      const extractedData = [
+        "Full HTML DOM",
+        "Structured page information",
+        "Forms and inputs",
+        "Metadata and SEO tags",
+        "Page structure analysis",
+        "Clean text content",
+        "Framework and pattern detection"
+      ];
+
+      return {
+        summary: "DOM extraction completed successfully",
+        stepsCompleted: 8,
+        extractedData
+      };
@@ -15,21 +15,58 @@ arguments:
   inputs:
     test_url: "https://httpbin.org/html"
     script_to_run: |
-      // Test script that gets page info and creates a simple element
+      // Enhanced script that demonstrates DOM extraction capabilities.
+      // Collects page info, adds a visible marker element to the page, and
+      // returns the collected data as a JSON string.
       const pageInfo = {
         title: document.title,
         url: window.location.href,
         timestamp: new Date().toISOString()
       };
 
+      // Get HTML DOM with size management
+      const html = document.documentElement.outerHTML;
+      const maxLength = 10000; // Limit for this test
+
+      // Extract page structure
+      const domInfo = {
+        ...pageInfo,
+        // First maxLength characters of the markup, plus the untruncated length
+        htmlSnippet: html.substring(0, maxLength),
+        htmlTotalLength: html.length,
+
+        // Extract specific elements
+        headings: Array.from(document.querySelectorAll('h1,h2,h3')).map(h => ({
+          level: h.tagName,
+          text: h.innerText
+        })),
+
+        // Count various elements
+        elementCounts: {
+          total: document.querySelectorAll('*').length,
+          links: document.links.length,
+          images: document.images.length,
+          forms: document.forms.length
+        },
+
+        // Get meta information
+        meta: {
+          description: document.querySelector('meta[name="description"]')?.content || null,
+          charset: document.characterSet,
+          language: document.documentElement.lang || null
+        }
+      };
+
       // Create a test div to show we can modify the page
       const testDiv = document.createElement('div');
       testDiv.id = 'terminator-test-element';
-      testDiv.style.cssText = 'position:fixed;top:10px;right:10px;background:red;color:white;padding:10px;z-index:9999;';
-      testDiv.textContent = 'Terminator Test: ' + pageInfo.title;
+      testDiv.style.cssText = 'position:fixed;top:10px;right:10px;background:green;color:white;padding:15px;z-index:9999;border-radius:5px;';
+      // Only the static heading goes through innerHTML. The page title is
+      // page-controlled text, so each line is appended as a text node and is
+      // never parsed as markup. html.length counts UTF-16 code units, hence
+      // "characters" rather than "bytes".
+      testDiv.innerHTML = '<strong>Terminator DOM Test</strong>';
+      [
+        `Title: ${pageInfo.title}`,
+        `Elements: ${domInfo.elementCounts.total}`,
+        `HTML Size: ${domInfo.htmlTotalLength} characters`
+      ].forEach(line => testDiv.append(document.createElement('br'), line));
       document.body.appendChild(testDiv);
 
-      return JSON.stringify(pageInfo);
+      return JSON.stringify(domInfo, null, 2);
 
   steps:
     # Step 1: Install Chrome extension using remote workflow