Skip to content

Commit aba195b

Browse files
committed
feat: add self-healing extension bridge with auto-recovery
- Replace static singleton with supervised wrapper that detects dead bridges - Bridge automatically recreates when server task crashes - Add health status reporting to /health endpoint - Add monitoring scripts for continuous health checking - Bridge now lazy-initializes on first use - Zero-downtime recovery on failure The bridge will now automatically recover if it crashes, eliminating the need to restart the entire MCP agent when the WebSocket server dies.
1 parent 08c55a9 commit aba195b

File tree

6 files changed

+825
-9
lines changed

6 files changed

+825
-9
lines changed

bridge_monitor.py

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Continuous Bridge Health Monitor with Self-Healing
4+
Monitors WebSocket bridge and MCP server, attempts recovery on failures
5+
"""
6+
7+
import asyncio
8+
import json
9+
import time
10+
import sys
11+
import subprocess
12+
from datetime import datetime
13+
import websockets
14+
import aiohttp
15+
from collections import deque
16+
17+
class BridgeHealthMonitor:
18+
def __init__(self):
19+
self.ws_url = "ws://127.0.0.1:17373"
20+
self.mcp_health_url = "http://127.0.0.1:8080/health"
21+
self.mcp_status_url = "http://127.0.0.1:8080/status"
22+
23+
# Health tracking
24+
self.ws_connected = False
25+
self.ws_last_pong = None
26+
self.mcp_healthy = False
27+
self.consecutive_failures = 0
28+
self.connection_drops = 0
29+
self.total_pings = 0
30+
self.successful_pings = 0
31+
32+
# History for pattern detection
33+
self.error_history = deque(maxlen=10)
34+
self.reconnect_history = deque(maxlen=10)
35+
36+
# Self-healing thresholds
37+
self.MAX_CONSECUTIVE_FAILURES = 3
38+
self.PING_INTERVAL = 5 # seconds
39+
self.HEALTH_CHECK_INTERVAL = 10 # seconds
40+
41+
def log(self, level, message):
42+
"""Structured logging with timestamps"""
43+
timestamp = datetime.now().strftime("%H:%M:%S")
44+
symbols = {"INFO": "ℹ️", "SUCCESS": "✅", "ERROR": "❌", "WARNING": "⚠️", "HEAL": "🔧"}
45+
print(f"[{timestamp}] {symbols.get(level, '•')} {message}")
46+
47+
async def check_mcp_health(self):
48+
"""Check MCP server health endpoints"""
49+
try:
50+
async with aiohttp.ClientSession() as session:
51+
# Check /health
52+
async with session.get(self.mcp_health_url) as resp:
53+
if resp.status == 200:
54+
data = await resp.json()
55+
if data.get("status") == "ok":
56+
self.mcp_healthy = True
57+
else:
58+
self.mcp_healthy = False
59+
60+
# Check /status for details
61+
async with session.get(self.mcp_status_url) as resp:
62+
if resp.status == 200:
63+
status = await resp.json()
64+
return status
65+
except Exception as e:
66+
self.mcp_healthy = False
67+
self.log("ERROR", f"MCP health check failed: {e}")
68+
return None
69+
70+
async def monitor_websocket(self):
71+
"""Continuously monitor WebSocket connection"""
72+
while True:
73+
try:
74+
self.log("INFO", f"Connecting to WebSocket bridge...")
75+
async with websockets.connect(self.ws_url) as ws:
76+
self.ws_connected = True
77+
self.consecutive_failures = 0
78+
self.log("SUCCESS", "WebSocket connected")
79+
80+
# Send hello
81+
await ws.send(json.dumps({"type": "hello", "from": "monitor"}))
82+
83+
# Ping loop
84+
while True:
85+
try:
86+
# Send ping
87+
await ws.send(json.dumps({"action": "ping"}))
88+
self.total_pings += 1
89+
90+
# Wait for pong with timeout
91+
try:
92+
response = await asyncio.wait_for(ws.recv(), timeout=2.0)
93+
msg = json.loads(response)
94+
if msg.get("type") == "pong":
95+
self.ws_last_pong = time.time()
96+
self.successful_pings += 1
97+
except asyncio.TimeoutError:
98+
self.log("WARNING", "Ping timeout - no pong received")
99+
self.consecutive_failures += 1
100+
101+
await asyncio.sleep(self.PING_INTERVAL)
102+
103+
except websockets.ConnectionClosed:
104+
break
105+
106+
except Exception as e:
107+
self.ws_connected = False
108+
self.connection_drops += 1
109+
self.consecutive_failures += 1
110+
self.error_history.append((datetime.now(), str(e)))
111+
self.log("ERROR", f"WebSocket error: {e}")
112+
113+
# Self-healing decision
114+
if self.consecutive_failures >= self.MAX_CONSECUTIVE_FAILURES:
115+
await self.attempt_recovery()
116+
117+
# Exponential backoff
118+
wait_time = min(2 ** self.consecutive_failures, 30)
119+
self.log("INFO", f"Reconnecting in {wait_time}s...")
120+
await asyncio.sleep(wait_time)
121+
122+
async def attempt_recovery(self):
123+
"""Attempt to recover the bridge connection"""
124+
self.log("HEAL", "Initiating self-healing sequence...")
125+
self.reconnect_history.append(datetime.now())
126+
127+
# Step 1: Check if extension needs restart
128+
if len(self.reconnect_history) >= 3:
129+
recent_reconnects = [r for r in self.reconnect_history
130+
if (datetime.now() - r).seconds < 300]
131+
if len(recent_reconnects) >= 3:
132+
self.log("HEAL", "Frequent reconnects detected - extension may need restart")
133+
self.log("WARNING", "Manual intervention recommended: Reload Chrome extension")
134+
135+
# Step 2: Check if port is still bound
136+
try:
137+
result = subprocess.run(
138+
["netstat", "-an"],
139+
capture_output=True,
140+
text=True,
141+
timeout=5
142+
)
143+
if "17373" in result.stdout and "LISTENING" in result.stdout:
144+
self.log("SUCCESS", "Port 17373 is still listening")
145+
else:
146+
self.log("ERROR", "Port 17373 not listening - bridge server may be down")
147+
# Could attempt to restart bridge here if we had permissions
148+
except Exception as e:
149+
self.log("ERROR", f"Failed to check port status: {e}")
150+
151+
# Step 3: Reset failure counter after recovery attempt
152+
self.consecutive_failures = 0
153+
154+
async def display_dashboard(self):
155+
"""Display health dashboard every 10 seconds"""
156+
while True:
157+
await asyncio.sleep(self.HEALTH_CHECK_INTERVAL)
158+
159+
# Check MCP health
160+
mcp_status = await self.check_mcp_health()
161+
162+
# Calculate metrics
163+
ping_success_rate = (self.successful_pings / max(self.total_pings, 1)) * 100
164+
uptime = "Connected" if self.ws_connected else "Disconnected"
165+
166+
# Display dashboard
167+
print("\n" + "="*60)
168+
print("📊 BRIDGE HEALTH DASHBOARD")
169+
print("="*60)
170+
print(f"WebSocket Bridge: {('✅ ' + uptime) if self.ws_connected else '❌ Disconnected'}")
171+
print(f"MCP Server: {'✅ Healthy' if self.mcp_healthy else '❌ Unhealthy'}")
172+
print(f"Ping Success Rate: {ping_success_rate:.1f}% ({self.successful_pings}/{self.total_pings})")
173+
print(f"Connection Drops: {self.connection_drops}")
174+
print(f"Consecutive Fails: {self.consecutive_failures}")
175+
176+
if mcp_status:
177+
print(f"MCP Active Reqs: {mcp_status.get('activeRequests', 0)}")
178+
print(f"MCP Busy: {'Yes' if mcp_status.get('busy') else 'No'}")
179+
180+
if self.ws_last_pong:
181+
last_pong_ago = int(time.time() - self.ws_last_pong)
182+
print(f"Last Pong: {last_pong_ago}s ago")
183+
184+
# Show recent errors
185+
if self.error_history:
186+
print("\nRecent Errors:")
187+
for timestamp, error in list(self.error_history)[-3:]:
188+
print(f" [{timestamp.strftime('%H:%M:%S')}] {error[:50]}")
189+
190+
print("="*60)
191+
192+
# Health score
193+
health_score = 0
194+
if self.ws_connected: health_score += 40
195+
if self.mcp_healthy: health_score += 30
196+
if ping_success_rate > 90: health_score += 20
197+
if self.consecutive_failures == 0: health_score += 10
198+
199+
if health_score >= 90:
200+
self.log("SUCCESS", f"System Health: EXCELLENT ({health_score}%)")
201+
elif health_score >= 70:
202+
self.log("INFO", f"System Health: GOOD ({health_score}%)")
203+
elif health_score >= 50:
204+
self.log("WARNING", f"System Health: DEGRADED ({health_score}%)")
205+
else:
206+
self.log("ERROR", f"System Health: CRITICAL ({health_score}%)")
207+
208+
async def run(self):
209+
"""Run all monitoring tasks concurrently"""
210+
self.log("INFO", "Starting Bridge Health Monitor v1.0")
211+
self.log("INFO", f"Monitoring WebSocket: {self.ws_url}")
212+
self.log("INFO", f"Monitoring MCP: {self.mcp_health_url}")
213+
print("="*60)
214+
215+
# Run monitoring tasks
216+
tasks = [
217+
asyncio.create_task(self.monitor_websocket()),
218+
asyncio.create_task(self.display_dashboard())
219+
]
220+
221+
try:
222+
await asyncio.gather(*tasks)
223+
except KeyboardInterrupt:
224+
self.log("INFO", "Shutting down monitor...")
225+
for task in tasks:
226+
task.cancel()
227+
228+
async def main():
229+
monitor = BridgeHealthMonitor()
230+
await monitor.run()
231+
232+
if __name__ == "__main__":
233+
try:
234+
asyncio.run(main())
235+
except KeyboardInterrupt:
236+
print("\n👋 Monitor stopped")
237+
sys.exit(0)

0 commit comments

Comments
 (0)