feat(m8): event-based alerts — Xid/OOM/MCE/PCIe/disk from the kernel log — 0.34.0 #28
@@ -5,6 +5,14 @@ All notable changes to RigDoctor are recorded here. Format follows
|
|||||||
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
|
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
|
||||||
release tag (so the auto-updater, D18, can compare versions).
|
release tag (so the auto-updater, D18, can compare versions).
|
||||||
|
|
||||||
|
## [0.34.0] - 2026-05-22
|
||||||
|
### Added
|
||||||
|
- **Event-based alerts (M8).** Beyond temperature + GPU-lost, RigDoctor now notifies on
|
||||||
|
**critical kernel events** — Xid (GPU error), out-of-memory kills, CPU machine-checks, PCIe
|
||||||
|
AER errors, and disk I/O errors — scanned from the kernel log every ~30s while monitoring and
|
||||||
|
fired one-shot (cooldown-gated, so no spam). A proactive warning the moment something goes
|
||||||
|
wrong, not just on a temperature threshold. Included whenever desktop notifications are on.
|
||||||
|
|
||||||
## [0.33.0] - 2026-05-22
|
## [0.33.0] - 2026-05-22
|
||||||
### Added
|
### Added
|
||||||
- **AI explanations stream live.** "Explain with AI" now fills token-by-token as the model
|
- **AI explanations stream live.** "Explain with AI" now fills token-by-token as the model
|
||||||
|
|||||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "rigdoctor"
|
name = "rigdoctor"
|
||||||
version = "0.33.0"
|
version = "0.34.0"
|
||||||
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
|
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
|
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
|
||||||
|
|
||||||
__version__ = "0.33.0"
|
__version__ = "0.34.0"
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
"""Desktop alerts (M8): notify on overheat / GPU-lost / new version via notify-send.
|
"""Desktop alerts (M8): notify on overheat / GPU-lost / critical kernel events / new version.
|
||||||
|
|
||||||
Edge-triggered: an alert fires when a condition becomes true (not every sample), and
|
Edge-triggered: a sustained condition (hot GPU, GPU-lost) fires once when it becomes true and
|
||||||
can fire again only after it has cleared and a cooldown has passed — so a hot GPU or a
|
can re-fire only after it clears + a cooldown; momentary **kernel events** (Xid, OOM-kill, MCE,
|
||||||
1-Hz sample loop doesn't spam notifications. Degrades to a no-op if notify-send is absent.
|
PCIe AER, disk I/O errors) are scanned from the kernel log every `event_interval` seconds and
|
||||||
|
fire one-shot (cooldown-gated). So a 1-Hz sample loop never spams. No-op if notify-send absent.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -57,13 +58,16 @@ def notify(title: str, message: str, urgency: str = "normal") -> bool:
|
|||||||
class AlertMonitor:
|
class AlertMonitor:
|
||||||
"""Evaluate samples and raise edge-triggered desktop alerts."""
|
"""Evaluate samples and raise edge-triggered desktop alerts."""
|
||||||
|
|
||||||
def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0):
|
def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0,
|
||||||
|
event_interval: float = 30.0):
|
||||||
self.gpu_temp = gpu_temp
|
self.gpu_temp = gpu_temp
|
||||||
self.cpu_temp = cpu_temp
|
self.cpu_temp = cpu_temp
|
||||||
self.cooldown = cooldown
|
self.cooldown = cooldown
|
||||||
|
self.event_interval = event_interval # how often to scan the kernel log
|
||||||
self.enabled = True
|
self.enabled = True
|
||||||
self._active: dict[str, bool] = {}
|
self._active: dict[str, bool] = {}
|
||||||
self._last: dict[str, float] = {}
|
self._last: dict[str, float] = {}
|
||||||
|
self._last_kernel_scan = time.time() # only alert on events after the monitor starts
|
||||||
|
|
||||||
def _fire(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
|
def _fire(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
|
||||||
if self._active.get(key):
|
if self._active.get(key):
|
||||||
@@ -75,9 +79,39 @@ class AlertMonitor:
|
|||||||
self._last[key] = now
|
self._last[key] = now
|
||||||
notify(title, message, urgency)
|
notify(title, message, urgency)
|
||||||
|
|
||||||
|
def _notify_once(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
|
||||||
|
"""One-shot alert for a momentary event (cooldown-gated, no active latch)."""
|
||||||
|
now = time.time()
|
||||||
|
if now - self._last.get(key, 0.0) < self.cooldown:
|
||||||
|
return
|
||||||
|
self._last[key] = now
|
||||||
|
notify(title, message, urgency)
|
||||||
|
|
||||||
def _clear(self, key: str) -> None:
|
def _clear(self, key: str) -> None:
|
||||||
self._active[key] = False
|
self._active[key] = False
|
||||||
|
|
||||||
|
def _scan_kernel_events(self) -> None:
|
||||||
|
"""Periodically scan the kernel log for new critical events (Xid/OOM/MCE/PCIe/disk)."""
|
||||||
|
now = time.time()
|
||||||
|
if now - self._last_kernel_scan < self.event_interval:
|
||||||
|
return
|
||||||
|
since = self._last_kernel_scan
|
||||||
|
self._last_kernel_scan = now
|
||||||
|
try:
|
||||||
|
from . import syslogs
|
||||||
|
|
||||||
|
text = syslogs.kernel_log(since=since)
|
||||||
|
except Exception: # alerting must never crash the sample loop
|
||||||
|
return
|
||||||
|
if not text:
|
||||||
|
return
|
||||||
|
seen: set[str] = set()
|
||||||
|
for label, line in syslogs.scan_critical(text):
|
||||||
|
if label in seen: # one alert per category per scan
|
||||||
|
continue
|
||||||
|
seen.add(label)
|
||||||
|
self._notify_once(f"kernel:{label}", label, line[:180])
|
||||||
|
|
||||||
def check(self, sample: Sample) -> None:
|
def check(self, sample: Sample) -> None:
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
return
|
return
|
||||||
@@ -107,3 +141,5 @@ class AlertMonitor:
|
|||||||
self._fire("gpu_lost", "GPU not responding", "nvidia-smi query timed out — the GPU may have dropped")
|
self._fire("gpu_lost", "GPU not responding", "nvidia-smi query timed out — the GPU may have dropped")
|
||||||
else:
|
else:
|
||||||
self._clear("gpu_lost")
|
self._clear("gpu_lost")
|
||||||
|
|
||||||
|
self._scan_kernel_events() # Xid / OOM / MCE / PCIe / disk I/O from the kernel log
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ Best-effort and size-bounded: degrades silently if a tool is missing or access i
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
@@ -118,6 +119,29 @@ def display_log(since: float | None = None, max_bytes: int = _MAX) -> str:
|
|||||||
return _tail_file(log, max_bytes) if log else ""
|
return _tail_file(log, max_bytes) if log else ""
|
||||||
|
|
||||||
|
|
||||||
|
# Kernel-log patterns worth alerting on in real time (M8 event alerts). (label, regex).
|
||||||
|
_CRITICAL = [
    ("GPU error (Xid)", re.compile(r"NVRM:\s*Xid", re.I)),
    ("Out of memory", re.compile(r"out of memory|oom-kill|killed process \d+", re.I)),
    ("CPU machine-check", re.compile(r"\bmce:|machine check", re.I)),
    ("PCIe error", re.compile(r"\bAER:|pcie bus error", re.I)),
    # ``\bata\d+`` is anchored on a word boundary so identifiers that merely end in
    # "ata<N>" (e.g. "userdata1 ... failed") are not reported as ATA port errors.
    ("Disk I/O error", re.compile(
        r"buffer i/o error|\bi/o error\b|critical medium error|ext4-fs error|"
        r"blk_update_request:.*error|\bata\d+.*(?:failed|error)", re.I)),
]


def scan_critical(text: str) -> list[tuple[str, str]]:
    """Return ``(label, line)`` for every line of *text* that matches a critical pattern.

    Each line is tested against ``_CRITICAL`` in table order and reported under the first
    label that matches, so a line yields at most one event. The returned line is stripped
    of surrounding whitespace. Events keep the order of the lines in *text*; an empty
    *text* gives an empty list.
    """
    events: list[tuple[str, str]] = []
    for line in text.splitlines():
        # First matching category wins; None means the line is not critical.
        label = next((name for name, pat in _CRITICAL if pat.search(line)), None)
        if label is not None:
            events.append((label, line.strip()))
    return events
|
||||||
|
|
||||||
|
|
||||||
def available() -> bool:
|
def available() -> bool:
|
||||||
return bool(shutil.which("journalctl") or shutil.which("coredumpctl")
|
return bool(shutil.which("journalctl") or shutil.which("coredumpctl")
|
||||||
or shutil.which("nvidia-smi") or _xorg_log())
|
or shutil.which("nvidia-smi") or _xorg_log())
|
||||||
|
|||||||
@@ -114,7 +114,8 @@ class SetupPage(QWidget):
|
|||||||
grid.addWidget(QLabel("CPU temperature alert"), 1, 0)
|
grid.addWidget(QLabel("CPU temperature alert"), 1, 0)
|
||||||
grid.addWidget(self._cpu_alert, 1, 1)
|
grid.addWidget(self._cpu_alert, 1, 1)
|
||||||
alerts_layout.addLayout(grid)
|
alerts_layout.addLayout(grid)
|
||||||
alerts_note = QLabel("GPU-lost and new-version alerts are included whenever notifications are enabled.")
|
alerts_note = QLabel("GPU-lost, critical kernel events (Xid, out-of-memory, disk I/O, PCIe), "
|
||||||
|
"and new-version alerts are included whenever notifications are enabled.")
|
||||||
alerts_note.setObjectName("Muted")
|
alerts_note.setObjectName("Muted")
|
||||||
alerts_note.setWordWrap(True)
|
alerts_note.setWordWrap(True)
|
||||||
alerts_layout.addWidget(alerts_note)
|
alerts_layout.addWidget(alerts_note)
|
||||||
|
|||||||
@@ -34,5 +34,35 @@ class AlertTests(unittest.TestCase):
|
|||||||
m.assert_called_once()
|
m.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
class KernelEventAlertTests(unittest.TestCase):
    """Exercise AlertMonitor._scan_kernel_events with notify and syslogs.kernel_log mocked."""

    @mock.patch.object(alerts, "notify")
    def test_kernel_event_fires_once_within_cooldown(self, m):
        """Two forced scans that see the same Xid line produce exactly one notification."""
        mon = alerts.AlertMonitor(cooldown=300.0, event_interval=0.0)
        mon._last_kernel_scan = 0.0  # force a scan
        with mock.patch("rigdoctor.core.syslogs.kernel_log",
                        return_value="NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus"):
            mon._scan_kernel_events()
            mon._last_kernel_scan = 0.0  # force another scan — cooldown must suppress it
            mon._scan_kernel_events()
        self.assertEqual(m.call_count, 1)
        # The first positional argument passed to notify is the alert title.
        self.assertIn("Xid", m.call_args[0][0])

    @mock.patch.object(alerts, "notify")
    def test_no_alert_when_kernel_log_empty(self, m):
        """An empty kernel log yields no notification."""
        mon = alerts.AlertMonitor(event_interval=0.0)
        mon._last_kernel_scan = 0.0
        with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value=""):
            mon._scan_kernel_events()
        m.assert_not_called()

    @mock.patch.object(alerts, "notify")
    def test_scan_gated_by_interval(self, m):
        """A freshly built monitor with a long interval neither reads the log nor notifies."""
        mon = alerts.AlertMonitor(event_interval=9999.0)  # just constructed → not due yet
        with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value="NVRM: Xid 79") as kl:
            mon._scan_kernel_events()
        kl.assert_not_called()
        m.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -72,6 +72,25 @@ class DisplayTests(unittest.TestCase):
|
|||||||
self.assertTrue(any(a.startswith("_COMM=") for a in cmd))
|
self.assertTrue(any(a.startswith("_COMM=") for a in cmd))
|
||||||
|
|
||||||
|
|
||||||
|
class ScanCriticalTests(unittest.TestCase):
    """Check syslogs.scan_critical against sample kernel-log lines."""

    def test_matches_each_category(self):
        """One sample line per category is labelled correctly; the ordinary line adds nothing."""
        text = "\n".join([
            "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus",
            "Out of memory: Killed process 1234 (PathOfExile)",
            "mce: [Hardware Error]: CPU 0",
            "pcieport 0000:00:01.0: AER: Corrected error received",
            "blk_update_request: I/O error, dev sda, sector 99",
            "this is a perfectly normal line",
        ])
        # Compared as a set: only which labels were produced matters here, not their order.
        labels = {label for label, _ in syslogs.scan_critical(text)}
        self.assertEqual(labels, {
            "GPU error (Xid)", "Out of memory", "CPU machine-check",
            "PCIe error", "Disk I/O error"})

    def test_clean_log_no_events(self):
        """Lines with no critical pattern produce an empty event list."""
        self.assertEqual(syslogs.scan_critical("usb 1-2: new high-speed device\nsystemd: started"), [])
|
||||||
|
|
||||||
|
|
||||||
class CollectTests(unittest.TestCase):
|
class CollectTests(unittest.TestCase):
|
||||||
def test_collect_combines_sections(self):
|
def test_collect_combines_sections(self):
|
||||||
with mock.patch.object(syslogs, "kernel_log", return_value="NVRM: Xid 79"), \
|
with mock.patch.object(syslogs, "kernel_log", return_value="NVRM: Xid 79"), \
|
||||||
|
|||||||
Reference in New Issue
Block a user