Files
intercept/utils/process_monitor.py
Smittix ecdc060d81 Add HackRF support to TSCM RF scan and misc improvements
TSCM RF scan now auto-detects HackRF via SDRFactory and uses
hackrf_sweep as an alternative to rtl_power. Also includes
improvements to listening post, rtlamr, weather satellite,
SubGHz, Meshtastic, SSTV, WeFax, and process monitor modules.

Fixes #154

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 20:58:57 +00:00

216 lines
7.4 KiB
Python

"""
Process health monitoring and auto-restart functionality.
"""
from __future__ import annotations
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Callable, Dict, Optional, Any
# Module-level logger used by the process monitor for all of its log output.
logger = logging.getLogger('intercept.process_monitor')
@dataclass
class ProcessInfo:
    """Bookkeeping record kept for each monitored process.

    Only ``name`` and ``process`` are required; every other field has a
    default and is filled in or updated by the monitor.
    """

    # Name the process is registered under.
    name: str
    # Process handle (subprocess.Popen); the monitor only calls ``poll()`` on it.
    process: Any
    # Timestamp taken when this record is created.
    started_at: datetime = field(default_factory=datetime.now)
    # Number of automatic restart attempts made so far.
    restart_count: int = field(default=0)
    # Time of the most recent successful automatic restart, if any.
    last_restart: Optional[datetime] = field(default=None)
    # Zero-argument callable invoked to restart the process; None = no restart.
    restart_callback: Optional[Callable] = field(default=None)
    # Upper bound on automatic restart attempts.
    max_restarts: int = field(default=3)
    # Base delay before a restart; doubled for every previous attempt.
    backoff_seconds: float = field(default=5.0)
    # False once the monitor has given up on restarting this process.
    enabled: bool = field(default=True)
class ProcessMonitor:
    """
    Monitor registered processes and auto-restart them when they exit.

    A daemon thread polls every registered process once per
    ``check_interval`` seconds. When a process has exited and a restart
    callback was supplied, the callback is invoked after an exponential
    backoff, up to ``max_restarts`` times. The callback is expected to hand
    the new process object back through ``update_process()``.

    Usage:
        monitor = ProcessMonitor()
        monitor.register('pager', process, restart_callback=start_pager)
        monitor.start()
    """

    def __init__(self, check_interval: float = 5.0):
        """
        Create an idle monitor; no thread runs until ``start()`` is called.

        Args:
            check_interval: Seconds between two health checks
        """
        self.processes: Dict[str, ProcessInfo] = {}
        self.check_interval = check_interval
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._lock = threading.Lock()
        # Set by stop(); every wait in the monitor blocks on this event so a
        # stop request interrupts sleeps and pending backoffs immediately.
        self._stop_event = threading.Event()
        # name -> process object whose exit was already reported. Only used
        # for processes without a restart callback, to log the exit once.
        self._reported_exits: Dict[str, Any] = {}

    def register(
        self,
        name: str,
        process: Any,
        restart_callback: Optional[Callable] = None,
        max_restarts: int = 3,
        backoff_seconds: float = 5.0
    ) -> None:
        """
        Register a process for monitoring.

        An existing registration under the same name is replaced.

        Args:
            name: Unique name for the process
            process: The subprocess.Popen object
            restart_callback: Function to call to restart the process
            max_restarts: Maximum number of automatic restarts
            backoff_seconds: Base backoff time between restarts
        """
        with self._lock:
            self.processes[name] = ProcessInfo(
                name=name,
                process=process,
                restart_callback=restart_callback,
                max_restarts=max_restarts,
                backoff_seconds=backoff_seconds
            )
            self._reported_exits.pop(name, None)
        logger.info(f"Registered process for monitoring: {name}")

    def unregister(self, name: str) -> None:
        """Remove a process from monitoring (no-op for unknown names)."""
        with self._lock:
            self._reported_exits.pop(name, None)
            if name in self.processes:
                del self.processes[name]
                logger.info(f"Unregistered process: {name}")

    def update_process(self, name: str, process: Any) -> None:
        """Update the process object for a registered name."""
        with self._lock:
            info = self.processes.get(name)
            if info is not None:
                info.process = process
                info.started_at = datetime.now()

    def start(self) -> None:
        """Start the monitoring thread (no-op if already running)."""
        if self._running:
            return
        # A fresh event per run: a previous loop whose stop() join timed out
        # keeps its own, already-set event and exits instead of being revived.
        self._stop_event = threading.Event()
        self._running = True
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()
        logger.info("Process monitor started")

    def stop(self) -> None:
        """Stop the monitoring thread, waiting up to check_interval + 1 s."""
        self._running = False
        self._stop_event.set()
        thread = self._thread
        # Joining the current thread raises RuntimeError; that would happen
        # if stop() were called from inside a restart callback.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=self.check_interval + 1)
            if thread.is_alive():
                logger.warning(
                    "Process monitor thread is still busy; "
                    "it will exit after its current check"
                )
        logger.info("Process monitor stopped")

    def _monitor_loop(self) -> None:
        """Main monitoring loop; runs until this run's stop event is set."""
        stop_event = self._stop_event
        while not stop_event.is_set():
            try:
                self._check_all_processes()
            except Exception:
                # One failing check must not end monitoring for good.
                logger.exception("Unexpected error while checking processes")
            # Interruptible replacement for time.sleep(check_interval).
            stop_event.wait(self.check_interval)

    def _check_all_processes(self) -> None:
        """Check health of all registered processes."""
        # Collect crashed processes under lock, handle restarts outside
        crashed: list[tuple[str, ProcessInfo]] = []
        with self._lock:
            for name, info in self.processes.items():
                if not info.enabled or info.process is None:
                    continue
                # Check if process has terminated
                return_code = info.process.poll()
                if return_code is None:
                    continue
                if info.restart_callback is None:
                    # Nothing will ever replace this process, so report its
                    # exit once instead of on every poll.
                    if self._reported_exits.get(name) is info.process:
                        continue
                    self._reported_exits[name] = info.process
                logger.warning(
                    f"Process '{name}' terminated with code {return_code}"
                )
                crashed.append((name, info))
        # Handle restarts outside lock (involves waits and callbacks)
        for name, info in crashed:
            if self._stop_event.is_set():
                break
            self._handle_crash(name, info)

    def _handle_crash(self, name: str, info: ProcessInfo) -> None:
        """
        Handle a crashed process. Must be called WITHOUT holding self._lock.

        Waits ``backoff_seconds * 2 ** restart_count`` and then invokes the
        restart callback, unless the monitor was stopped or the registration
        changed during the wait. Every attempt, successful or not, counts
        towards ``max_restarts``.
        """
        if info.restart_callback is None:
            logger.info(f"No restart callback for '{name}', skipping auto-restart")
            return
        if info.restart_count >= info.max_restarts:
            logger.error(
                f"Process '{name}' exceeded max restarts ({info.max_restarts}), "
                "disabling auto-restart"
            )
            with self._lock:
                info.enabled = False
            return
        # Calculate backoff with exponential increase
        backoff = info.backoff_seconds * (2 ** info.restart_count)
        logger.info(
            f"Attempting to restart '{name}' in {backoff:.1f}s "
            f"(attempt {info.restart_count + 1}/{info.max_restarts})"
        )
        # Wait for backoff period outside lock; wait() returns True as soon
        # as stop() is called, in which case the restart is abandoned.
        if self._stop_event.wait(backoff):
            logger.info(f"Monitor stopped, cancelling restart of '{name}'")
            return
        # The registration may have changed during the wait (unregistered,
        # disabled, or already replaced by a live process). Restarting then
        # would spawn a duplicate.
        with self._lock:
            still_needed = (
                self.processes.get(name) is info
                and info.enabled
                and info.process is not None
                and info.process.poll() is not None
            )
        if not still_needed:
            logger.info(f"Skipping restart of '{name}': no longer required")
            return
        # Attempt restart
        try:
            info.restart_callback()
        except Exception as e:
            succeeded = False
            logger.error(f"Failed to restart '{name}': {e}")
        else:
            succeeded = True
        with self._lock:
            info.restart_count += 1
            if succeeded:
                info.last_restart = datetime.now()
        if succeeded:
            logger.info(f"Successfully restarted '{name}'")

    def get_status(self) -> Dict[str, Any]:
        """
        Get status of all monitored processes.

        Returns:
            Dict mapping each process name to a dict with the keys
            'running', 'started_at', 'restart_count', 'last_restart',
            'auto_restart_enabled' and 'return_code'
        """
        with self._lock:
            status = {}
            for name, info in self.processes.items():
                # Poll once so 'running' and 'return_code' always agree.
                return_code = (
                    info.process.poll() if info.process is not None else None
                )
                status[name] = {
                    'running': info.process is not None and return_code is None,
                    'started_at': info.started_at.isoformat() if info.started_at else None,
                    'restart_count': info.restart_count,
                    'last_restart': info.last_restart.isoformat() if info.last_restart else None,
                    'auto_restart_enabled': info.enabled,
                    'return_code': return_code
                }
            return status

    def reset_restart_count(self, name: str) -> None:
        """Reset the restart count for a process (e.g., after manual restart)."""
        with self._lock:
            info = self.processes.get(name)
            if info is not None:
                info.restart_count = 0
                info.enabled = True

    def is_healthy(self) -> bool:
        """Check if all processes are healthy (none of them has exited)."""
        with self._lock:
            return not any(
                info.process is not None and info.process.poll() is not None
                for info in self.processes.values()
            )
# Global monitor instance, created with the default check interval.
# Constructing it only sets up state; no monitoring thread runs until
# process_monitor.start() is called.
process_monitor = ProcessMonitor()