""" Process health monitoring and auto-restart functionality. """ from __future__ import annotations import logging import threading import time from dataclasses import dataclass, field from datetime import datetime from typing import Any, Callable logger = logging.getLogger('intercept.process_monitor') @dataclass class ProcessInfo: """Information about a monitored process.""" name: str process: Any # subprocess.Popen started_at: datetime = field(default_factory=datetime.now) restart_count: int = 0 last_restart: datetime | None = None restart_callback: Callable | None = None max_restarts: int = 3 backoff_seconds: float = 5.0 enabled: bool = True class ProcessMonitor: """ Monitor and auto-restart processes. Usage: monitor = ProcessMonitor() monitor.register('pager', process, restart_callback=start_pager) monitor.start() """ def __init__(self, check_interval: float = 5.0): self.processes: dict[str, ProcessInfo] = {} self.check_interval = check_interval self._running = False self._thread: threading.Thread | None = None self._lock = threading.Lock() def register( self, name: str, process: Any, restart_callback: Callable | None = None, max_restarts: int = 3, backoff_seconds: float = 5.0 ) -> None: """ Register a process for monitoring. Args: name: Unique name for the process process: The subprocess.Popen object restart_callback: Function to call to restart the process max_restarts: Maximum number of automatic restarts backoff_seconds: Base backoff time between restarts """ with self._lock: self.processes[name] = ProcessInfo( name=name, process=process, restart_callback=restart_callback, max_restarts=max_restarts, backoff_seconds=backoff_seconds ) logger.info(f"Registered process for monitoring: {name}") def unregister(self, name: str) -> None: """Remove a process from monitoring.""" with self._lock: if name in self.processes: del self.processes[name] logger.info(f"Unregistered process: {name}") def update_process(self, name: str, process: Any) -> None: """Update the process object for a registered name.""" with self._lock: if name in self.processes: self.processes[name].process = process self.processes[name].started_at = datetime.now() def start(self) -> None: """Start the monitoring thread.""" if self._running: return self._running = True self._thread = threading.Thread(target=self._monitor_loop, daemon=True) self._thread.start() logger.info("Process monitor started") def stop(self) -> None: """Stop the monitoring thread.""" self._running = False if self._thread: self._thread.join(timeout=self.check_interval + 1) logger.info("Process monitor stopped") def _monitor_loop(self) -> None: """Main monitoring loop.""" while self._running: self._check_all_processes() time.sleep(self.check_interval) def _check_all_processes(self) -> None: """Check health of all registered processes.""" # Collect crashed processes under lock, handle restarts outside crashed: list[tuple[str, ProcessInfo]] = [] with self._lock: for name, info in list(self.processes.items()): if not info.enabled: continue if info.process is None: continue # Check if process has terminated return_code = info.process.poll() if return_code is not None: logger.warning( f"Process '{name}' terminated with code {return_code}" ) crashed.append((name, info)) # Handle restarts outside lock (involves sleeps and callbacks) for name, info in crashed: self._handle_crash(name, info) def _handle_crash(self, name: str, info: ProcessInfo) -> None: """Handle a crashed process. Must be called WITHOUT holding self._lock.""" if info.restart_callback is None: logger.info(f"No restart callback for '{name}', skipping auto-restart") return if info.restart_count >= info.max_restarts: logger.error( f"Process '{name}' exceeded max restarts ({info.max_restarts}), " "disabling auto-restart" ) with self._lock: info.enabled = False return # Calculate backoff with exponential increase backoff = info.backoff_seconds * (2 ** info.restart_count) logger.info( f"Attempting to restart '{name}' in {backoff:.1f}s " f"(attempt {info.restart_count + 1}/{info.max_restarts})" ) # Wait for backoff period outside lock time.sleep(backoff) # Attempt restart try: info.restart_callback() with self._lock: info.restart_count += 1 info.last_restart = datetime.now() logger.info(f"Successfully restarted '{name}'") except Exception as e: logger.error(f"Failed to restart '{name}': {e}") with self._lock: info.restart_count += 1 def get_status(self) -> dict[str, Any]: """ Get status of all monitored processes. Returns: Dict with process status information """ with self._lock: status = {} for name, info in self.processes.items(): is_running = ( info.process is not None and info.process.poll() is None ) status[name] = { 'running': is_running, 'started_at': info.started_at.isoformat() if info.started_at else None, 'restart_count': info.restart_count, 'last_restart': info.last_restart.isoformat() if info.last_restart else None, 'auto_restart_enabled': info.enabled, 'return_code': info.process.poll() if info.process else None } return status def reset_restart_count(self, name: str) -> None: """Reset the restart count for a process (e.g., after manual restart).""" with self._lock: if name in self.processes: self.processes[name].restart_count = 0 self.processes[name].enabled = True def is_healthy(self) -> bool: """Check if all processes are healthy.""" with self._lock: for info in self.processes.values(): if info.process is not None and info.process.poll() is not None: return False return True # Global monitor instance process_monitor = ProcessMonitor()