From 2ee7763d008677783aa32e6fa1c4ed55dc22693a Mon Sep 17 00:00:00 2001 From: Jessey van Offeren Date: Fri, 22 May 2026 14:41:13 +0200 Subject: [PATCH 1/2] =?UTF-8?q?feat(m8):=20event-based=20alerts=20?= =?UTF-8?q?=E2=80=94=20Xid/OOM/MCE/PCIe/disk=20from=20the=20kernel=20log?= =?UTF-8?q?=20=E2=80=94=200.34.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AlertMonitor now scans the kernel log (journalctl -k) every ~30s and fires one-shot, cooldown-gated desktop alerts on critical events: NVIDIA Xid, OOM kills, CPU machine-checks, PCIe AER, and disk I/O errors — so users are warned the moment something goes wrong, not only on a temperature threshold. Disk I/O errors come from the kernel log (no root needed, unlike smartctl). Edge/spam protection reuses the existing cooldown model. syslogs.scan_critical() does the matching; init seeds last-scan to "now" so old boot logs don't alert on launch. Tests for the matcher + monitor gating/cooldown; Settings note updated. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 8 ++++++ pyproject.toml | 2 +- src/rigdoctor/__init__.py | 2 +- src/rigdoctor/core/alerts.py | 46 +++++++++++++++++++++++++++++---- src/rigdoctor/core/syslogs.py | 24 +++++++++++++++++ src/rigdoctor/gui/setup_page.py | 3 ++- tests/test_alerts.py | 30 +++++++++++++++++++++ tests/test_syslogs.py | 19 ++++++++++++++ 8 files changed, 126 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 626c328..35783a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to RigDoctor are recorded here. Format follows (`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git release tag (so the auto-updater, D18, can compare versions). +## [0.34.0] - 2026-05-22 +### Added +- **Event-based alerts (M8).** Beyond temperature + GPU-lost, RigDoctor now notifies on + **critical kernel events** — Xid (GPU error), out-of-memory kills, CPU machine-checks, PCIe + AER errors, and disk I/O errors — scanned from the kernel log every ~30s while monitoring and + fired one-shot (cooldown-gated, so no spam). A proactive warning the moment something goes + wrong, not just on a temperature threshold. Included whenever desktop notifications are on. + ## [0.33.0] - 2026-05-22 ### Added - **AI explanations stream live.** "Explain with AI" now fills token-by-token as the model diff --git a/pyproject.toml b/pyproject.toml index 2004b5f..c53d25a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "rigdoctor" -version = "0.33.0" +version = "0.34.0" description = "Modular hardware monitoring & crash diagnostics for Linux gamers." readme = "README.md" requires-python = ">=3.11" diff --git a/src/rigdoctor/__init__.py b/src/rigdoctor/__init__.py index 852713c..4b835db 100644 --- a/src/rigdoctor/__init__.py +++ b/src/rigdoctor/__init__.py @@ -1,3 +1,3 @@ """RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers.""" -__version__ = "0.33.0" +__version__ = "0.34.0" diff --git a/src/rigdoctor/core/alerts.py b/src/rigdoctor/core/alerts.py index c976951..c7f4f98 100644 --- a/src/rigdoctor/core/alerts.py +++ b/src/rigdoctor/core/alerts.py @@ -1,8 +1,9 @@ -"""Desktop alerts (M8): notify on overheat / GPU-lost / new version via notify-send. +"""Desktop alerts (M8): notify on overheat / GPU-lost / critical kernel events / new version. -Edge-triggered: an alert fires when a condition becomes true (not every sample), and -can fire again only after it has cleared and a cooldown has passed — so a hot GPU or a -1-Hz sample loop doesn't spam notifications. Degrades to a no-op if notify-send is absent. +Edge-triggered: a sustained condition (hot GPU, GPU-lost) fires once when it becomes true and +can re-fire only after it clears + a cooldown; momentary **kernel events** (Xid, OOM-kill, MCE, +PCIe AER, disk I/O errors) are scanned from the kernel log every `event_interval` seconds and +fire one-shot (cooldown-gated). So a 1-Hz sample loop never spams. No-op if notify-send absent. """ from __future__ import annotations @@ -57,13 +58,16 @@ def notify(title: str, message: str, urgency: str = "normal") -> bool: class AlertMonitor: """Evaluate samples and raise edge-triggered desktop alerts.""" - def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0): + def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0, + event_interval: float = 30.0): self.gpu_temp = gpu_temp self.cpu_temp = cpu_temp self.cooldown = cooldown + self.event_interval = event_interval # how often to scan the kernel log self.enabled = True self._active: dict[str, bool] = {} self._last: dict[str, float] = {} + self._last_kernel_scan = time.time() # only alert on events after the monitor starts def _fire(self, key: str, title: str, message: str, urgency: str = "critical") -> None: if self._active.get(key): @@ -75,9 +79,39 @@ class AlertMonitor: self._last[key] = now notify(title, message, urgency) + def _notify_once(self, key: str, title: str, message: str, urgency: str = "critical") -> None: + """One-shot alert for a momentary event (cooldown-gated, no active latch).""" + now = time.time() + if now - self._last.get(key, 0.0) < self.cooldown: + return + self._last[key] = now + notify(title, message, urgency) + def _clear(self, key: str) -> None: self._active[key] = False + def _scan_kernel_events(self) -> None: + """Periodically scan the kernel log for new critical events (Xid/OOM/MCE/PCIe/disk).""" + now = time.time() + if now - self._last_kernel_scan < self.event_interval: + return + since = self._last_kernel_scan + self._last_kernel_scan = now + try: + from . import syslogs + + text = syslogs.kernel_log(since=since) + except Exception: # alerting must never crash the sample loop + return + if not text: + return + seen: set[str] = set() + for label, line in syslogs.scan_critical(text): + if label in seen: # one alert per category per scan + continue + seen.add(label) + self._notify_once(f"kernel:{label}", label, line[:180]) + def check(self, sample: Sample) -> None: if not self.enabled: return @@ -107,3 +141,5 @@ class AlertMonitor: self._fire("gpu_lost", "GPU not responding", "nvidia-smi query timed out — the GPU may have dropped") else: self._clear("gpu_lost") + + self._scan_kernel_events() # Xid / OOM / MCE / PCIe / disk I/O from the kernel log diff --git a/src/rigdoctor/core/syslogs.py b/src/rigdoctor/core/syslogs.py index 4299070..00e0f8a 100644 --- a/src/rigdoctor/core/syslogs.py +++ b/src/rigdoctor/core/syslogs.py @@ -13,6 +13,7 @@ Best-effort and size-bounded: degrades silently if a tool is missing or access i from __future__ import annotations import os +import re import shutil import subprocess import time @@ -118,6 +119,29 @@ def display_log(since: float | None = None, max_bytes: int = _MAX) -> str: return _tail_file(log, max_bytes) if log else "" +# Kernel-log patterns worth alerting on in real time (M8 event alerts). (label, regex). +_CRITICAL = [ + ("GPU error (Xid)", re.compile(r"NVRM:\s*Xid", re.I)), + ("Out of memory", re.compile(r"out of memory|oom-kill|killed process \d+", re.I)), + ("CPU machine-check", re.compile(r"\bmce:|machine check", re.I)), + ("PCIe error", re.compile(r"\bAER:|pcie bus error", re.I)), + ("Disk I/O error", re.compile( + r"buffer i/o error|\bi/o error\b|critical medium error|ext4-fs error|" + r"blk_update_request:.*error|ata\d+.*(?:failed|error)", re.I)), +] + + +def scan_critical(text: str) -> list[tuple[str, str]]: + """(label, line) for kernel lines matching a critical pattern (first match per line).""" + events: list[tuple[str, str]] = [] + for line in text.splitlines(): + for label, pat in _CRITICAL: + if pat.search(line): + events.append((label, line.strip())) + break + return events + + def available() -> bool: return bool(shutil.which("journalctl") or shutil.which("coredumpctl") or shutil.which("nvidia-smi") or _xorg_log()) diff --git a/src/rigdoctor/gui/setup_page.py b/src/rigdoctor/gui/setup_page.py index d67d414..28f0811 100644 --- a/src/rigdoctor/gui/setup_page.py +++ b/src/rigdoctor/gui/setup_page.py @@ -114,7 +114,8 @@ class SetupPage(QWidget): grid.addWidget(QLabel("CPU temperature alert"), 1, 0) grid.addWidget(self._cpu_alert, 1, 1) alerts_layout.addLayout(grid) - alerts_note = QLabel("GPU-lost and new-version alerts are included whenever notifications are enabled.") + alerts_note = QLabel("GPU-lost, critical kernel events (Xid, out-of-memory, disk I/O, PCIe), " + "and new-version alerts are included whenever notifications are enabled.") alerts_note.setObjectName("Muted") alerts_note.setWordWrap(True) alerts_layout.addWidget(alerts_note) diff --git a/tests/test_alerts.py b/tests/test_alerts.py index f8af1b3..5890456 100644 --- a/tests/test_alerts.py +++ b/tests/test_alerts.py @@ -34,5 +34,35 @@ class AlertTests(unittest.TestCase): m.assert_called_once() +class KernelEventAlertTests(unittest.TestCase): + @mock.patch.object(alerts, "notify") + def test_kernel_event_fires_once_within_cooldown(self, m): + mon = alerts.AlertMonitor(cooldown=300.0, event_interval=0.0) + mon._last_kernel_scan = 0.0 # force a scan + with mock.patch("rigdoctor.core.syslogs.kernel_log", + return_value="NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus"): + mon._scan_kernel_events() + mon._last_kernel_scan = 0.0 # force another scan — cooldown must suppress it + mon._scan_kernel_events() + self.assertEqual(m.call_count, 1) + self.assertIn("Xid", m.call_args[0][0]) + + @mock.patch.object(alerts, "notify") + def test_no_alert_when_kernel_log_empty(self, m): + mon = alerts.AlertMonitor(event_interval=0.0) + mon._last_kernel_scan = 0.0 + with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value=""): + mon._scan_kernel_events() + m.assert_not_called() + + @mock.patch.object(alerts, "notify") + def test_scan_gated_by_interval(self, m): + mon = alerts.AlertMonitor(event_interval=9999.0) # just constructed → not due yet + with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value="NVRM: Xid 79") as kl: + mon._scan_kernel_events() + kl.assert_not_called() + m.assert_not_called() + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_syslogs.py b/tests/test_syslogs.py index 24ce0be..9e3041d 100644 --- a/tests/test_syslogs.py +++ b/tests/test_syslogs.py @@ -72,6 +72,25 @@ class DisplayTests(unittest.TestCase): self.assertTrue(any(a.startswith("_COMM=") for a in cmd)) +class ScanCriticalTests(unittest.TestCase): + def test_matches_each_category(self): + text = "\n".join([ + "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus", + "Out of memory: Killed process 1234 (PathOfExile)", + "mce: [Hardware Error]: CPU 0", + "pcieport 0000:00:01.0: AER: Corrected error received", + "blk_update_request: I/O error, dev sda, sector 99", + "this is a perfectly normal line", + ]) + labels = {label for label, _ in syslogs.scan_critical(text)} + self.assertEqual(labels, { + "GPU error (Xid)", "Out of memory", "CPU machine-check", + "PCIe error", "Disk I/O error"}) + + def test_clean_log_no_events(self): + self.assertEqual(syslogs.scan_critical("usb 1-2: new high-speed device\nsystemd: started"), []) + + class CollectTests(unittest.TestCase): def test_collect_combines_sections(self): with mock.patch.object(syslogs, "kernel_log", return_value="NVRM: Xid 79"), \ -- 2.52.0 From 2989e8e23e5b08d09e3a5daa5bfeda94976c9bad Mon Sep 17 00:00:00 2001 From: Jessey van Offeren Date: Fri, 22 May 2026 14:42:41 +0200 Subject: [PATCH 2/2] ci: run tests.yml on pull_request only (no push) to avoid double runs A branch with an open PR triggered both the push and pull_request events, running every job twice. Trigger on pull_request only; pushes to main are already tested by release.yml's `test` job. No version bump (CI config only). Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/tests.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitea/workflows/tests.yml b/.gitea/workflows/tests.yml index 0960d07..bc8301b 100644 --- a/.gitea/workflows/tests.yml +++ b/.gitea/workflows/tests.yml @@ -1,14 +1,15 @@ name: tests run-name: Run test suite -# Runs the unittest suite on every push and pull request. Two jobs: +# Runs the unittest suite on pull requests (once per PR). Pushes to main are covered by the +# `test` job in release.yml, so we don't trigger on push here — that would double every run. +# Two jobs: # core — stdlib-only install; the GUI tests skip (@skipUnless HAVE_QT). Bulletproof. # gui-smoke — installs the GUI extra + offscreen Qt libs and runs the same suite headless, # exercising the MainWindow/SetupWizard/DiagnosticDialog construction tests. -# Make `core` a required status check on `main` so a PR can't merge with failing tests. +# Make `tests / core (pull_request)` a required status check on `main` so a PR can't merge red. on: - push: pull_request: jobs: -- 2.52.0