Shorten agent health checks on load

2026-04-24 06:40:00 -07:00 · 2026-03-19 08:09:07 +00:00
parent db0a26cd64
commit fb8b6a01e8
2 changed files with 80 additions and 53 deletions
--- a/routes/controller.py
+++ b/routes/controller.py
@@ -40,14 +40,16 @@ from utils.trilateration import (
    estimate_location_from_observations,
 )
-logger = logging.getLogger('intercept.controller')
+logger = logging.getLogger('intercept.controller')
-
+
-controller_bp = Blueprint('controller', __name__, url_prefix='/controller')
+controller_bp = Blueprint('controller', __name__, url_prefix='/controller')
-
+AGENT_HEALTH_TIMEOUT_SECONDS = 2.0
-# Multi-agent SSE fanout state (per-client queues).
+AGENT_STATUS_TIMEOUT_SECONDS = 2.5
-_agent_stream_subscribers: set[queue.Queue] = set()
+
-_agent_stream_subscribers_lock = threading.Lock()
+# Multi-agent SSE fanout state (per-client queues).
-_AGENT_STREAM_CLIENT_QUEUE_SIZE = 500
+_agent_stream_subscribers: set[queue.Queue] = set()
 _agent_stream_subscribers_lock = threading.Lock()
 _AGENT_STREAM_CLIENT_QUEUE_SIZE = 500
 def _broadcast_agent_data(payload: dict) -> None:
@@ -77,14 +79,18 @@ def get_agents():
    agents = list_agents(active_only=active_only)
    # Optionally refresh status for each agent
-    refresh = request.args.get('refresh', 'false').lower() == 'true'
+    refresh = request.args.get('refresh', 'false').lower() == 'true'
-    if refresh:
+    if refresh:
-        for agent in agents:
+        for agent in agents:
-            try:
+            try:
-                client = create_client_from_agent(agent)
+                client = AgentClient(
-                agent['healthy'] = client.health_check()
+                    agent['base_url'],
-            except Exception:
+                    api_key=agent.get('api_key'),
-                agent['healthy'] = False
+                    timeout=AGENT_HEALTH_TIMEOUT_SECONDS,
                )
                agent['healthy'] = client.health_check()
            except Exception:
                agent['healthy'] = False
    return jsonify({
        'status': 'success',
@@ -327,27 +333,36 @@ def check_all_agents_health():
            'error': None
        }
-        try:
+        try:
-            client = create_client_from_agent(agent)
+            client = AgentClient(
-
+                agent['base_url'],
-            # Time the health check
+                api_key=agent.get('api_key'),
-            start_time = time.time()
+                timeout=AGENT_HEALTH_TIMEOUT_SECONDS,
-            is_healthy = client.health_check()
+            )
-            response_time = (time.time() - start_time) * 1000
+
            # Time the health check
            start_time = time.time()
            is_healthy = client.health_check()
            response_time = (time.time() - start_time) * 1000
            result['healthy'] = is_healthy
            result['response_time_ms'] = round(response_time, 1)
            if is_healthy:
-                # Update last_seen in database
+                # Update last_seen in database
-                update_agent(agent['id'], update_last_seen=True)
+                update_agent(agent['id'], update_last_seen=True)
-
+
-                # Also fetch running modes
+                # Also fetch running modes
-                try:
+                try:
-                    status = client.get_status()
+                    status_client = AgentClient(
-                    result['running_modes'] = status.get('running_modes', [])
+                        agent['base_url'],
-                    result['running_modes_detail'] = status.get('running_modes_detail', {})
+                        api_key=agent.get('api_key'),
-                except Exception:
+                        timeout=AGENT_STATUS_TIMEOUT_SECONDS,
                    )
                    status = status_client.get_status()
                    result['running_modes'] = status.get('running_modes', [])
                    result['running_modes_detail'] = status.get('running_modes_detail', {})
                except Exception:
                    pass  # Status fetch is optional
        except AgentConnectionError as e:
--- a/static/js/core/agents.js
+++ b/static/js/core/agents.js
@@ -10,10 +10,11 @@ let currentAgent = 'local';
 let agentEventSource = null;
 let multiAgentMode = false;  // Show combined results from all agents
 let multiAgentPollInterval = null;
-let agentRunningModes = [];  // Track agent's running modes for conflict detection
+let agentRunningModes = [];  // Track agent's running modes for conflict detection
-let agentRunningModesDetail = {};  // Track device info per mode (for multi-SDR agents)
+let agentRunningModesDetail = {};  // Track device info per mode (for multi-SDR agents)
-let healthCheckInterval = null;  // Health monitoring interval
+let healthCheckInterval = null;  // Health monitoring interval
-let agentHealthStatus = {};  // Cache of health status per agent ID
+let agentHealthStatus = {};  // Cache of health status per agent ID
 let healthCheckKickoffTimer = null;
 // ============== AGENT HEALTH MONITORING ==============
@@ -21,27 +22,38 @@ let agentHealthStatus = {};  // Cache of health status per agent ID
 * Start periodic health monitoring for all agents.
 * Runs every 30 seconds to check agent health status; the first check is deferred by 5 seconds.
 */
-function startHealthMonitoring() {
+function startHealthMonitoring() {
-    // Don't start if already running
+    // Don't start if already running
-    if (healthCheckInterval) return;
+    if (healthCheckInterval) return;
-
+
-    // Initial check
+    // Defer the first probe so heavy dashboards can finish initial render
-    checkAllAgentsHealth();
+    // before we start contacting remote agents.
-
+    if (healthCheckKickoffTimer) {
-    // Start periodic checks every 30 seconds
+        clearTimeout(healthCheckKickoffTimer);
-    healthCheckInterval = setInterval(checkAllAgentsHealth, 30000);
+    }
-    console.log('[AgentManager] Health monitoring started (30s interval)');
+    healthCheckKickoffTimer = setTimeout(() => {
-}
+        healthCheckKickoffTimer = null;
        checkAllAgentsHealth();
    }, 5000);
    // Start periodic checks every 30 seconds
    healthCheckInterval = setInterval(checkAllAgentsHealth, 30000);
    console.log('[AgentManager] Health monitoring started (30s interval)');
 }
 /**
 * Stop health monitoring.
 */
-function stopHealthMonitoring() {
+function stopHealthMonitoring() {
-    if (healthCheckInterval) {
+    if (healthCheckKickoffTimer) {
-        clearInterval(healthCheckInterval);
+        clearTimeout(healthCheckKickoffTimer);
-        healthCheckInterval = null;
+        healthCheckKickoffTimer = null;
-        console.log('[AgentManager] Health monitoring stopped');
+    }
-    }
+    if (healthCheckInterval) {
        clearInterval(healthCheckInterval);
        healthCheckInterval = null;
        console.log('[AgentManager] Health monitoring stopped');
    }
 }
 /**