Files
rigdoctor/src/rigdoctor/core/alerts.py
T
jessey 2ee7763d00
tests / core (push) Successful in 12s
tests / gui-smoke (push) Successful in 27s
tests / core (pull_request) Successful in 12s
tests / gui-smoke (pull_request) Successful in 26s
feat(m8): event-based alerts — Xid/OOM/MCE/PCIe/disk from the kernel log — 0.34.0
AlertMonitor now scans the kernel log (journalctl -k) every ~30s and fires
one-shot, cooldown-gated desktop alerts on critical events: NVIDIA Xid, OOM
kills, CPU machine-checks, PCIe AER, and disk I/O errors — so users are warned
the moment something goes wrong, not only on a temperature threshold. Disk I/O
errors come from the kernel log (no root needed, unlike smartctl). Edge/spam
protection reuses the existing cooldown model. syslogs.scan_critical() does the
matching; init seeds last-scan to "now" so old boot logs don't alert on launch.
Tests for the matcher + monitor gating/cooldown; Settings note updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 14:41:13 +02:00

146 lines
5.5 KiB
Python

"""Desktop alerts (M8): notify on overheat / GPU-lost / critical kernel events / new version.
Edge-triggered: a sustained condition (hot GPU, GPU-lost) fires once when it becomes true and
can re-fire only after it clears + a cooldown; momentary **kernel events** (Xid, OOM-kill, MCE,
PCIe AER, disk I/O errors) are scanned from the kernel log every `event_interval` seconds and
fire one-shot (cooldown-gated). So a 1-Hz sample loop never spams. No-op if notify-send absent.
"""
from __future__ import annotations
import shutil
import subprocess
import time
from pathlib import Path
from ..config import DATA_DIR
from .sample import Sample
# Application name handed to notify-send through its -a option.
APP_NAME = "RigDoctor"

# Themed icon name used as the last resort when no RigDoctor SVG is found on disk.
_STOCK_ICON = "utilities-system-monitor"

# Notification icon candidates, so notifications match the app/dock icon:
#   1. the copy that desktop integration installs into the icon theme
#      (~/.local/share/icons/...),
#   2. the asset bundled with the package, for source/dev runs.
# If neither file is present the stock icon above is used instead.
_INSTALLED_ICON = DATA_DIR.parent.joinpath("icons", "hicolor", "scalable", "apps", "rigdoctor.svg")
_BUNDLED_ICON = Path(__file__).parents[1].joinpath("gui", "assets", "rigdoctor.svg")
def available() -> bool:
    """Report whether a `notify-send` executable can be located via PATH."""
    notifier = shutil.which("notify-send")
    return notifier is not None
def _icon() -> str:
    """Pick the notification icon, looked up on every call.

    The lookup is not cached because the themed copy may be installed after the
    module was imported. Returns the first candidate file that exists, as a string
    path, or the stock icon name when none does.
    """
    for candidate in (_INSTALLED_ICON, _BUNDLED_ICON):
        try:
            present = candidate.exists()
        except OSError:
            # An unreadable location is treated the same as a missing file.
            continue
        if present:
            return str(candidate)
    return _STOCK_ICON
def notify(title: str, message: str, urgency: str = "normal") -> bool:
    """Show a desktop notification through `notify-send`, best-effort.

    Args:
        title: Notification summary line.
        message: Notification body text.
        urgency: Passed to `notify-send -u`; one of low|normal|critical.

    Returns:
        False when notify-send is not available or could not be run (launch error
        or the 10-second timeout); True once the command has been run. The exit
        status of notify-send is not inspected.
    """
    if not available():
        return False
    try:
        command = ["notify-send", "-a", APP_NAME, "-u", urgency, "-i", _icon(), title, message]
        subprocess.run(command, timeout=10, check=False)
    except (subprocess.SubprocessError, OSError):
        return False
    else:
        return True
class AlertMonitor:
    """Evaluate samples and raise edge-triggered desktop alerts.

    Two alert styles are used:

    * Sustained conditions (GPU/CPU over-temperature, GPU not responding) go through
      `_fire` / `_clear`: the alert is sent once when the condition becomes true and
      can be sent again only after the condition has cleared and `cooldown` seconds
      have passed since the previous alert for that key.
    * Momentary kernel events go through `_notify_once`: no latch, only the
      per-key cooldown.

    Setting `enabled` to False turns `check()` into a no-op apart from keeping the
    kernel-log cursor current.
    """

    def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0,
                 event_interval: float = 30.0) -> None:
        """Configure thresholds and timing.

        Args:
            gpu_temp: GPU temperature (°C) at or above which the overheat alert fires.
            cpu_temp: CPU temperature (°C) at or above which the overheat alert fires.
            cooldown: Minimum seconds between two alerts that share the same key.
            event_interval: Minimum seconds between two kernel-log scans.
        """
        self.gpu_temp = gpu_temp
        self.cpu_temp = cpu_temp
        self.cooldown = cooldown
        self.event_interval = event_interval  # how often to scan the kernel log
        self.enabled = True
        self._active: dict[str, bool] = {}  # key -> condition currently latched as alerting
        self._last: dict[str, float] = {}  # key -> time.time() of the last alert sent
        self._last_kernel_scan = time.time()  # only alert on events after the monitor starts

    def _fire(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
        """Alert for a sustained condition, at most once per occurrence.

        Does nothing while the key is latched active or still inside its cooldown.
        A cooldown-suppressed call does not latch, so a later sample can still fire
        once the cooldown has elapsed.
        """
        if self._active.get(key):
            return  # already alerting; wait until it clears
        now = time.time()
        if now - self._last.get(key, 0.0) < self.cooldown:
            return
        self._active[key] = True
        self._last[key] = now
        notify(title, message, urgency)

    def _notify_once(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
        """One-shot alert for a momentary event (cooldown-gated, no active latch)."""
        now = time.time()
        if now - self._last.get(key, 0.0) < self.cooldown:
            return
        self._last[key] = now
        notify(title, message, urgency)

    def _clear(self, key: str) -> None:
        """Release the active latch for `key` so the condition may alert again."""
        self._active[key] = False

    def _scan_kernel_events(self) -> None:
        """Periodically scan the kernel log for new critical events (Xid/OOM/MCE/PCIe/disk).

        Runs at most once per `event_interval` seconds and asks for log text since the
        previous scan. At most one alert per category is sent per scan, each still
        subject to the per-key cooldown. Any failure while reading or matching the
        log is swallowed: alerting must never crash the sample loop.
        """
        now = time.time()
        if now - self._last_kernel_scan < self.event_interval:
            return
        since = self._last_kernel_scan
        # Advance the cursor before querying, so a failing query is retried only after
        # another full interval instead of on every sample.
        self._last_kernel_scan = now
        try:
            from . import syslogs

            text = syslogs.kernel_log(since=since)
            # Matching happens inside the guard as well: an error raised by the
            # matcher must not escape into the caller's sample loop.
            events = [(label, line) for label, line in syslogs.scan_critical(text)] if text else []
        except Exception:  # alerting must never crash the sample loop
            return
        seen: set[str] = set()
        for label, line in events:
            if label in seen:  # one alert per category per scan
                continue
            seen.add(label)
            # The body is capped at 180 characters of the matching log line.
            self._notify_once(f"kernel:{label}", label, line[:180])

    def check(self, sample: Sample) -> None:
        """Evaluate one sample and send whatever alerts it warrants.

        Looks at `sample.readings` for GPU temperature, CPU temperatures and the GPU
        `query-timeout` status, then runs the (rate-limited) kernel-log scan.
        """
        if not self.enabled:
            # Keep the kernel-log cursor current while alerts are off. Otherwise,
            # re-enabling would replay every event logged in the meantime as if it
            # had just happened, and scan an arbitrarily large log window.
            self._last_kernel_scan = time.time()
            return

        # First GPU temperature reading with an empty label that carries a value.
        gpu_t = next(
            (r.value for r in sample.readings
             if r.source == "gpu" and r.metric == "temp" and r.label == "" and r.value is not None),
            None,
        )
        if gpu_t is not None:
            if gpu_t >= self.gpu_temp:
                self._fire("gpu_temp", "GPU overheating", f"GPU at {gpu_t:.0f} °C")
            else:
                self._clear("gpu_temp")

        # The hottest CPU temperature reading decides the CPU alert.
        cpu_temps = [r.value for r in sample.readings
                     if r.source == "cpu" and r.metric == "temp" and r.value is not None]
        if cpu_temps:
            cpu_t = max(cpu_temps)
            if cpu_t >= self.cpu_temp:
                self._fire("cpu_temp", "CPU overheating", f"CPU at {cpu_t:.0f} °C")
            else:
                self._clear("cpu_temp")

        lost = any(r.source == "gpu" and r.metric == "status" and r.label == "query-timeout"
                   for r in sample.readings)
        if lost:
            self._fire("gpu_lost", "GPU not responding", "nvidia-smi query timed out — the GPU may have dropped")
        else:
            self._clear("gpu_lost")

        self._scan_kernel_events()  # Xid / OOM / MCE / PCIe / disk I/O from the kernel log