1
+ #!/usr/bin/env python3
2
+ """
3
+ Continuous Bridge Health Monitor with Self-Healing
4
+ Monitors WebSocket bridge and MCP server, attempts recovery on failures
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import time
10
+ import sys
11
+ import subprocess
12
+ from datetime import datetime
13
+ import websockets
14
+ import aiohttp
15
+ from collections import deque
16
+
17
+ class BridgeHealthMonitor :
18
+ def __init__ (self ):
19
+ self .ws_url = "ws://127.0.0.1:17373"
20
+ self .mcp_health_url = "http://127.0.0.1:8080/health"
21
+ self .mcp_status_url = "http://127.0.0.1:8080/status"
22
+
23
+ # Health tracking
24
+ self .ws_connected = False
25
+ self .ws_last_pong = None
26
+ self .mcp_healthy = False
27
+ self .consecutive_failures = 0
28
+ self .connection_drops = 0
29
+ self .total_pings = 0
30
+ self .successful_pings = 0
31
+
32
+ # History for pattern detection
33
+ self .error_history = deque (maxlen = 10 )
34
+ self .reconnect_history = deque (maxlen = 10 )
35
+
36
+ # Self-healing thresholds
37
+ self .MAX_CONSECUTIVE_FAILURES = 3
38
+ self .PING_INTERVAL = 5 # seconds
39
+ self .HEALTH_CHECK_INTERVAL = 10 # seconds
40
+
41
+ def log (self , level , message ):
42
+ """Structured logging with timestamps"""
43
+ timestamp = datetime .now ().strftime ("%H:%M:%S" )
44
+ symbols = {"INFO" : "ℹ️" , "SUCCESS" : "✅" , "ERROR" : "❌" , "WARNING" : "⚠️" , "HEAL" : "🔧" }
45
+ print (f"[{ timestamp } ] { symbols .get (level , '•' )} { message } " )
46
+
47
+ async def check_mcp_health (self ):
48
+ """Check MCP server health endpoints"""
49
+ try :
50
+ async with aiohttp .ClientSession () as session :
51
+ # Check /health
52
+ async with session .get (self .mcp_health_url ) as resp :
53
+ if resp .status == 200 :
54
+ data = await resp .json ()
55
+ if data .get ("status" ) == "ok" :
56
+ self .mcp_healthy = True
57
+ else :
58
+ self .mcp_healthy = False
59
+
60
+ # Check /status for details
61
+ async with session .get (self .mcp_status_url ) as resp :
62
+ if resp .status == 200 :
63
+ status = await resp .json ()
64
+ return status
65
+ except Exception as e :
66
+ self .mcp_healthy = False
67
+ self .log ("ERROR" , f"MCP health check failed: { e } " )
68
+ return None
69
+
70
+ async def monitor_websocket (self ):
71
+ """Continuously monitor WebSocket connection"""
72
+ while True :
73
+ try :
74
+ self .log ("INFO" , f"Connecting to WebSocket bridge..." )
75
+ async with websockets .connect (self .ws_url ) as ws :
76
+ self .ws_connected = True
77
+ self .consecutive_failures = 0
78
+ self .log ("SUCCESS" , "WebSocket connected" )
79
+
80
+ # Send hello
81
+ await ws .send (json .dumps ({"type" : "hello" , "from" : "monitor" }))
82
+
83
+ # Ping loop
84
+ while True :
85
+ try :
86
+ # Send ping
87
+ await ws .send (json .dumps ({"action" : "ping" }))
88
+ self .total_pings += 1
89
+
90
+ # Wait for pong with timeout
91
+ try :
92
+ response = await asyncio .wait_for (ws .recv (), timeout = 2.0 )
93
+ msg = json .loads (response )
94
+ if msg .get ("type" ) == "pong" :
95
+ self .ws_last_pong = time .time ()
96
+ self .successful_pings += 1
97
+ except asyncio .TimeoutError :
98
+ self .log ("WARNING" , "Ping timeout - no pong received" )
99
+ self .consecutive_failures += 1
100
+
101
+ await asyncio .sleep (self .PING_INTERVAL )
102
+
103
+ except websockets .ConnectionClosed :
104
+ break
105
+
106
+ except Exception as e :
107
+ self .ws_connected = False
108
+ self .connection_drops += 1
109
+ self .consecutive_failures += 1
110
+ self .error_history .append ((datetime .now (), str (e )))
111
+ self .log ("ERROR" , f"WebSocket error: { e } " )
112
+
113
+ # Self-healing decision
114
+ if self .consecutive_failures >= self .MAX_CONSECUTIVE_FAILURES :
115
+ await self .attempt_recovery ()
116
+
117
+ # Exponential backoff
118
+ wait_time = min (2 ** self .consecutive_failures , 30 )
119
+ self .log ("INFO" , f"Reconnecting in { wait_time } s..." )
120
+ await asyncio .sleep (wait_time )
121
+
122
+ async def attempt_recovery (self ):
123
+ """Attempt to recover the bridge connection"""
124
+ self .log ("HEAL" , "Initiating self-healing sequence..." )
125
+ self .reconnect_history .append (datetime .now ())
126
+
127
+ # Step 1: Check if extension needs restart
128
+ if len (self .reconnect_history ) >= 3 :
129
+ recent_reconnects = [r for r in self .reconnect_history
130
+ if (datetime .now () - r ).seconds < 300 ]
131
+ if len (recent_reconnects ) >= 3 :
132
+ self .log ("HEAL" , "Frequent reconnects detected - extension may need restart" )
133
+ self .log ("WARNING" , "Manual intervention recommended: Reload Chrome extension" )
134
+
135
+ # Step 2: Check if port is still bound
136
+ try :
137
+ result = subprocess .run (
138
+ ["netstat" , "-an" ],
139
+ capture_output = True ,
140
+ text = True ,
141
+ timeout = 5
142
+ )
143
+ if "17373" in result .stdout and "LISTENING" in result .stdout :
144
+ self .log ("SUCCESS" , "Port 17373 is still listening" )
145
+ else :
146
+ self .log ("ERROR" , "Port 17373 not listening - bridge server may be down" )
147
+ # Could attempt to restart bridge here if we had permissions
148
+ except Exception as e :
149
+ self .log ("ERROR" , f"Failed to check port status: { e } " )
150
+
151
+ # Step 3: Reset failure counter after recovery attempt
152
+ self .consecutive_failures = 0
153
+
154
+ async def display_dashboard (self ):
155
+ """Display health dashboard every 10 seconds"""
156
+ while True :
157
+ await asyncio .sleep (self .HEALTH_CHECK_INTERVAL )
158
+
159
+ # Check MCP health
160
+ mcp_status = await self .check_mcp_health ()
161
+
162
+ # Calculate metrics
163
+ ping_success_rate = (self .successful_pings / max (self .total_pings , 1 )) * 100
164
+ uptime = "Connected" if self .ws_connected else "Disconnected"
165
+
166
+ # Display dashboard
167
+ print ("\n " + "=" * 60 )
168
+ print ("📊 BRIDGE HEALTH DASHBOARD" )
169
+ print ("=" * 60 )
170
+ print (f"WebSocket Bridge: { ('✅ ' + uptime ) if self .ws_connected else '❌ Disconnected' } " )
171
+ print (f"MCP Server: { '✅ Healthy' if self .mcp_healthy else '❌ Unhealthy' } " )
172
+ print (f"Ping Success Rate: { ping_success_rate :.1f} % ({ self .successful_pings } /{ self .total_pings } )" )
173
+ print (f"Connection Drops: { self .connection_drops } " )
174
+ print (f"Consecutive Fails: { self .consecutive_failures } " )
175
+
176
+ if mcp_status :
177
+ print (f"MCP Active Reqs: { mcp_status .get ('activeRequests' , 0 )} " )
178
+ print (f"MCP Busy: { 'Yes' if mcp_status .get ('busy' ) else 'No' } " )
179
+
180
+ if self .ws_last_pong :
181
+ last_pong_ago = int (time .time () - self .ws_last_pong )
182
+ print (f"Last Pong: { last_pong_ago } s ago" )
183
+
184
+ # Show recent errors
185
+ if self .error_history :
186
+ print ("\n Recent Errors:" )
187
+ for timestamp , error in list (self .error_history )[- 3 :]:
188
+ print (f" [{ timestamp .strftime ('%H:%M:%S' )} ] { error [:50 ]} " )
189
+
190
+ print ("=" * 60 )
191
+
192
+ # Health score
193
+ health_score = 0
194
+ if self .ws_connected : health_score += 40
195
+ if self .mcp_healthy : health_score += 30
196
+ if ping_success_rate > 90 : health_score += 20
197
+ if self .consecutive_failures == 0 : health_score += 10
198
+
199
+ if health_score >= 90 :
200
+ self .log ("SUCCESS" , f"System Health: EXCELLENT ({ health_score } %)" )
201
+ elif health_score >= 70 :
202
+ self .log ("INFO" , f"System Health: GOOD ({ health_score } %)" )
203
+ elif health_score >= 50 :
204
+ self .log ("WARNING" , f"System Health: DEGRADED ({ health_score } %)" )
205
+ else :
206
+ self .log ("ERROR" , f"System Health: CRITICAL ({ health_score } %)" )
207
+
208
+ async def run (self ):
209
+ """Run all monitoring tasks concurrently"""
210
+ self .log ("INFO" , "Starting Bridge Health Monitor v1.0" )
211
+ self .log ("INFO" , f"Monitoring WebSocket: { self .ws_url } " )
212
+ self .log ("INFO" , f"Monitoring MCP: { self .mcp_health_url } " )
213
+ print ("=" * 60 )
214
+
215
+ # Run monitoring tasks
216
+ tasks = [
217
+ asyncio .create_task (self .monitor_websocket ()),
218
+ asyncio .create_task (self .display_dashboard ())
219
+ ]
220
+
221
+ try :
222
+ await asyncio .gather (* tasks )
223
+ except KeyboardInterrupt :
224
+ self .log ("INFO" , "Shutting down monitor..." )
225
+ for task in tasks :
226
+ task .cancel ()
227
+
228
+ async def main ():
229
+ monitor = BridgeHealthMonitor ()
230
+ await monitor .run ()
231
+
232
+ if __name__ == "__main__" :
233
+ try :
234
+ asyncio .run (main ())
235
+ except KeyboardInterrupt :
236
+ print ("\n 👋 Monitor stopped" )
237
+ sys .exit (0 )
0 commit comments