2ee7763d00
AlertMonitor now scans the kernel log (journalctl -k) every ~30s and fires one-shot, cooldown-gated desktop alerts on critical events: NVIDIA Xid, OOM kills, CPU machine-checks, PCIe AER, and disk I/O errors — so users are warned the moment something goes wrong, not only on a temperature threshold. Disk I/O errors come from the kernel log (no root needed, unlike smartctl). Edge/spam protection reuses the existing cooldown model. syslogs.scan_critical() does the matching; init seeds last-scan to "now" so old boot logs don't alert on launch. Tests for the matcher + monitor gating/cooldown; Settings note updated. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
146 lines
5.5 KiB
Python
146 lines
5.5 KiB
Python
"""Desktop alerts (M8): notify on overheat / GPU-lost / critical kernel events / new version.
|
|
|
|
Edge-triggered: a sustained condition (hot GPU, GPU-lost) fires once when it becomes true and
|
|
can re-fire only after it clears + a cooldown; momentary **kernel events** (Xid, OOM-kill, MCE,
|
|
PCIe AER, disk I/O errors) are scanned from the kernel log every `event_interval` seconds and
|
|
fire one-shot (cooldown-gated). So a 1-Hz sample loop never spams. No-op if notify-send absent.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import shutil
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from ..config import DATA_DIR
|
|
from .sample import Sample
|
|
|
|
APP_NAME = "RigDoctor"
|
|
_STOCK_ICON = "utilities-system-monitor"
|
|
# The RigDoctor icon, so notifications match the app/dock icon. Prefer the copy that
|
|
# desktop integration installs into the icon theme (~/.local/share/icons/...); fall back to
|
|
# the bundled asset for source/dev runs, then to a stock icon if neither is present.
|
|
_INSTALLED_ICON = DATA_DIR.parent / "icons" / "hicolor" / "scalable" / "apps" / "rigdoctor.svg"
|
|
_BUNDLED_ICON = Path(__file__).parents[1] / "gui" / "assets" / "rigdoctor.svg"
|
|
|
|
|
|
def available() -> bool:
    """Report whether a `notify-send` executable can be found on PATH."""
    notifier = shutil.which("notify-send")
    return notifier is not None
|
|
|
|
|
|
def _icon() -> str:
    """Pick the notification icon, looking it up afresh on every call.

    Nothing is cached because the themed copy may be installed after startup. The installed
    icon wins, then the bundled asset; if neither exists the stock icon name is returned.
    """
    for candidate in (_INSTALLED_ICON, _BUNDLED_ICON):
        try:
            found = candidate.exists()
        except OSError:
            # A location that cannot be probed just means "try the next candidate".
            continue
        if found:
            return str(candidate)
    return _STOCK_ICON
|
|
|
|
|
|
def notify(title: str, message: str, urgency: str = "normal") -> bool:
    """Best-effort desktop notification through `notify-send`.

    `urgency` is handed to `notify-send -u` (low|normal|critical). Returns False when
    notify-send is unavailable or when launching it fails or times out; returns True once the
    command has run (its exit status is deliberately not checked).
    """
    if not available():
        return False
    try:
        command = [
            "notify-send",
            "-a", APP_NAME,
            "-u", urgency,
            "-i", _icon(),
            title,
            message,
        ]
        # Bounded wait so a hung notification daemon cannot stall the caller.
        subprocess.run(command, timeout=10, check=False)
    except (subprocess.SubprocessError, OSError):
        return False
    else:
        return True
|
|
|
|
|
|
class AlertMonitor:
    """Evaluate samples and raise edge-triggered desktop alerts.

    Two alert styles are used:

    * Sustained conditions (GPU/CPU over temperature, GPU query timeout) are latched in
      `_active`: they notify once when the condition becomes true and cannot notify again
      until the condition has cleared and `cooldown` seconds have passed since the last alert.
    * Momentary kernel-log events are looked for every `event_interval` seconds and notify
      one-shot, gated only by a per-category cooldown.
    """

    def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0,
                 event_interval: float = 30.0):
        """Configure thresholds and timing.

        Args:
            gpu_temp: GPU temperature (°C) at or above which the overheat alert fires.
            cpu_temp: CPU temperature (°C) at or above which the overheat alert fires.
            cooldown: Minimum seconds between two alerts sharing the same key.
            event_interval: Minimum seconds between two kernel-log scans.
        """
        self.gpu_temp = gpu_temp
        self.cpu_temp = cpu_temp
        self.cooldown = cooldown
        self.event_interval = event_interval  # how often to scan the kernel log
        self.enabled = True
        self._active: dict[str, bool] = {}  # key -> sustained condition currently latched
        self._last: dict[str, float] = {}  # key -> time.time() of the last alert sent
        self._last_kernel_scan = time.time()  # only alert on events after the monitor starts

    def _fire(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
        """Alert for a sustained condition unless it is already latched or still cooling down."""
        if self._active.get(key):
            return  # already alerting; wait until it clears
        now = time.time()
        if now - self._last.get(key, 0.0) < self.cooldown:
            # Not latched here, so the alert can still go out once the cooldown has elapsed
            # if the condition persists.
            return
        self._active[key] = True
        self._last[key] = now
        notify(title, message, urgency)

    def _notify_once(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
        """One-shot alert for a momentary event (cooldown-gated, no active latch)."""
        now = time.time()
        if now - self._last.get(key, 0.0) < self.cooldown:
            return
        self._last[key] = now
        notify(title, message, urgency)

    def _clear(self, key: str) -> None:
        """Release the latch for `key` so a later `_fire` may alert again."""
        self._active[key] = False

    def _scan_kernel_events(self) -> None:
        """Periodically scan the kernel log for new critical events (Xid/OOM/MCE/PCIe/disk).

        Does nothing until `event_interval` seconds have passed since the previous scan.
        Otherwise the log written since that scan is read and matched, and at most one alert
        per event category is sent. Any failure while importing, reading or matching is
        swallowed: alerting must never crash the sample loop.
        """
        now = time.time()
        if now - self._last_kernel_scan < self.event_interval:
            return
        since = self._last_kernel_scan
        # Advance the window before reading, so a failing scan is retried at the next
        # interval rather than on every sample.
        self._last_kernel_scan = now

        first_hits: dict[str, str] = {}  # category label -> excerpt of its first matching line
        try:
            from . import syslogs

            text = syslogs.kernel_log(since=since)
            if text:
                for label, line in syslogs.scan_critical(text):
                    # One alert per category per scan: keep only the first matching line.
                    first_hits.setdefault(label, line[:180])
        except Exception:  # alerting must never crash the sample loop
            # Deliberate best-effort: whatever was matched before the failure is still
            # reported below; the rest of this window is dropped.
            pass

        for label, excerpt in first_hits.items():
            self._notify_once(f"kernel:{label}", label, excerpt)

    def check(self, sample: Sample) -> None:
        """Evaluate one sample and send whatever alerts it warrants.

        Reads `sample.readings`, using each reading's `source`, `metric`, `label` and `value`.
        Does nothing at all while `enabled` is False.
        """
        if not self.enabled:
            return

        # GPU temperature: the first gpu/temp reading with an empty label and a value.
        gpu_t = next(
            (r.value for r in sample.readings
             if r.source == "gpu" and r.metric == "temp" and r.label == "" and r.value is not None),
            None,
        )
        if gpu_t is not None:  # with no reading, the latch is left as it was
            if gpu_t >= self.gpu_temp:
                self._fire("gpu_temp", "GPU overheating", f"GPU at {gpu_t:.0f} °C")
            else:
                self._clear("gpu_temp")

        # CPU temperature: the hottest of all cpu/temp readings decides.
        cpu_temps = [r.value for r in sample.readings
                     if r.source == "cpu" and r.metric == "temp" and r.value is not None]
        if cpu_temps:
            cpu_t = max(cpu_temps)
            if cpu_t >= self.cpu_temp:
                self._fire("cpu_temp", "CPU overheating", f"CPU at {cpu_t:.0f} °C")
            else:
                self._clear("cpu_temp")

        # GPU lost: signalled by a gpu/status reading labelled "query-timeout".
        lost = any(r.source == "gpu" and r.metric == "status" and r.label == "query-timeout"
                   for r in sample.readings)
        if lost:
            self._fire(
                "gpu_lost",
                "GPU not responding",
                "nvidia-smi query timed out — the GPU may have dropped",
            )
        else:
            self._clear("gpu_lost")

        self._scan_kernel_events()  # Xid / OOM / MCE / PCIe / disk I/O from the kernel log
|