From 2ee7763d008677783aa32e6fa1c4ed55dc22693a Mon Sep 17 00:00:00 2001
From: Jessey van Offeren <jjvanofferen@gmail.com>
Date: Fri, 22 May 2026 14:41:13 +0200
Subject: [PATCH 1/2] =?UTF-8?q?feat(m8):=20event-based=20alerts=20?=
 =?UTF-8?q?=E2=80=94=20Xid/OOM/MCE/PCIe/disk=20from=20the=20kernel=20log?=
 =?UTF-8?q?=20=E2=80=94=200.34.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AlertMonitor now scans the kernel log (journalctl -k) every ~30s and fires
one-shot, cooldown-gated desktop alerts on critical events: NVIDIA Xid, OOM
kills, CPU machine-checks, PCIe AER, and disk I/O errors — so users are warned
within ~30s of something going wrong, not only on a temperature threshold. Disk
I/O errors come from the kernel log (no root needed, unlike smartctl). Spam
protection reuses the existing cooldown, applied per event category without the
active latch (at most one alert per category per scan). syslogs.scan_critical()
does the matching; init seeds last-scan to "now" so old boot logs don't alert on
launch. Adds tests for the matcher and for monitor gating/cooldown; updates the
Settings note.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                    |  8 ++++++
 pyproject.toml                  |  2 +-
 src/rigdoctor/__init__.py       |  2 +-
 src/rigdoctor/core/alerts.py    | 46 +++++++++++++++++++++++++++++----
 src/rigdoctor/core/syslogs.py   | 24 +++++++++++++++++
 src/rigdoctor/gui/setup_page.py |  3 ++-
 tests/test_alerts.py            | 30 +++++++++++++++++++++
 tests/test_syslogs.py           | 19 ++++++++++++++
 8 files changed, 126 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 626c328..35783a7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to RigDoctor are recorded here. Format follows
 (`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
 release tag (so the auto-updater, D18, can compare versions).
 
+## [0.34.0] - 2026-05-22
+### Added
+- **Event-based alerts (M8).** Beyond temperature + GPU-lost, RigDoctor now notifies on
+  **critical kernel events** — Xid (GPU error), out-of-memory kills, CPU machine-checks, PCIe
+  AER errors, and disk I/O errors — scanned from the kernel log every ~30s while monitoring and
+  fired one-shot (cooldown-gated, so no spam) — a proactive warning within ~30s of a fault,
+  not just on a temperature threshold. Included whenever desktop notifications are on.
+
 ## [0.33.0] - 2026-05-22
 ### Added
 - **AI explanations stream live.** "Explain with AI" now fills token-by-token as the model
diff --git a/pyproject.toml b/pyproject.toml
index 2004b5f..c53d25a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rigdoctor"
-version = "0.33.0"
+version = "0.34.0"
 description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/src/rigdoctor/__init__.py b/src/rigdoctor/__init__.py
index 852713c..4b835db 100644
--- a/src/rigdoctor/__init__.py
+++ b/src/rigdoctor/__init__.py
@@ -1,3 +1,3 @@
 """RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
 
-__version__ = "0.33.0"
+__version__ = "0.34.0"
diff --git a/src/rigdoctor/core/alerts.py b/src/rigdoctor/core/alerts.py
index c976951..c7f4f98 100644
--- a/src/rigdoctor/core/alerts.py
+++ b/src/rigdoctor/core/alerts.py
@@ -1,8 +1,9 @@
-"""Desktop alerts (M8): notify on overheat / GPU-lost / new version via notify-send.
+"""Desktop alerts (M8): notify on overheat / GPU-lost / critical kernel events / new version.
 
-Edge-triggered: an alert fires when a condition becomes true (not every sample), and
-can fire again only after it has cleared and a cooldown has passed — so a hot GPU or a
-1-Hz sample loop doesn't spam notifications. Degrades to a no-op if notify-send is absent.
+Edge-triggered: a sustained condition (hot GPU, GPU-lost) fires once when it becomes true and
+can re-fire only after it clears + a cooldown; momentary **kernel events** (Xid, OOM-kill, MCE,
+PCIe AER, disk I/O errors) are scanned from the kernel log every `event_interval` seconds and
+fire one-shot (cooldown-gated). So a 1-Hz sample loop never spams. No-op if notify-send absent.
 """
 
 from __future__ import annotations
@@ -57,13 +58,16 @@ def notify(title: str, message: str, urgency: str = "normal") -> bool:
 class AlertMonitor:
     """Evaluate samples and raise edge-triggered desktop alerts."""
 
-    def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0):
+    def __init__(self, gpu_temp: float = 90.0, cpu_temp: float = 95.0, cooldown: float = 300.0,
+                 event_interval: float = 30.0):
         self.gpu_temp = gpu_temp
         self.cpu_temp = cpu_temp
         self.cooldown = cooldown
+        self.event_interval = event_interval     # how often to scan the kernel log
         self.enabled = True
         self._active: dict[str, bool] = {}
         self._last: dict[str, float] = {}
+        self._last_kernel_scan = time.time()     # only alert on events after the monitor starts
 
     def _fire(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
         if self._active.get(key):
@@ -75,9 +79,39 @@ class AlertMonitor:
         self._last[key] = now
         notify(title, message, urgency)
 
+    def _notify_once(self, key: str, title: str, message: str, urgency: str = "critical") -> None:
+        """One-shot alert for a momentary event (cooldown-gated, no active latch)."""
+        now = time.time()
+        if now - self._last.get(key, 0.0) < self.cooldown:
+            return
+        self._last[key] = now
+        notify(title, message, urgency)
+
     def _clear(self, key: str) -> None:
         self._active[key] = False
 
+    def _scan_kernel_events(self) -> None:
+        """Periodically scan the kernel log for new critical events (Xid/OOM/MCE/PCIe/disk)."""
+        now = time.time()
+        if now - self._last_kernel_scan < self.event_interval:
+            return
+        since = self._last_kernel_scan
+        self._last_kernel_scan = now
+        try:
+            from . import syslogs
+
+            text = syslogs.kernel_log(since=since)
+        except Exception:  # alerting must never crash the sample loop
+            return
+        if not text:
+            return
+        seen: set[str] = set()
+        for label, line in syslogs.scan_critical(text):
+            if label in seen:  # one alert per category per scan
+                continue
+            seen.add(label)
+            self._notify_once(f"kernel:{label}", label, line[:180])
+
     def check(self, sample: Sample) -> None:
         if not self.enabled:
             return
@@ -107,3 +141,5 @@ class AlertMonitor:
             self._fire("gpu_lost", "GPU not responding", "nvidia-smi query timed out — the GPU may have dropped")
         else:
             self._clear("gpu_lost")
+
+        self._scan_kernel_events()  # Xid / OOM / MCE / PCIe / disk I/O from the kernel log
diff --git a/src/rigdoctor/core/syslogs.py b/src/rigdoctor/core/syslogs.py
index 4299070..00e0f8a 100644
--- a/src/rigdoctor/core/syslogs.py
+++ b/src/rigdoctor/core/syslogs.py
@@ -13,6 +13,7 @@ Best-effort and size-bounded: degrades silently if a tool is missing or access i
 from __future__ import annotations
 
 import os
+import re
 import shutil
 import subprocess
 import time
@@ -118,6 +119,29 @@ def display_log(since: float | None = None, max_bytes: int = _MAX) -> str:
     return _tail_file(log, max_bytes) if log else ""
 
 
+# Kernel-log patterns worth alerting on in real time (M8 event alerts). (label, regex).
+_CRITICAL = [
+    ("GPU error (Xid)", re.compile(r"NVRM:\s*Xid", re.I)),
+    ("Out of memory", re.compile(r"out of memory|oom-kill|killed process \d+", re.I)),
+    ("CPU machine-check", re.compile(r"\bmce:|machine check", re.I)),
+    ("PCIe error", re.compile(r"\bAER:|pcie bus error", re.I)),
+    ("Disk I/O error", re.compile(
+        r"buffer i/o error|\bi/o error\b|critical medium error|ext4-fs error|"
+        r"blk_update_request:.*error|ata\d+.*(?:failed|error)", re.I)),
+]
+
+
+def scan_critical(text: str) -> list[tuple[str, str]]:
+    """(label, line) for kernel lines matching a critical pattern (first match per line)."""
+    events: list[tuple[str, str]] = []
+    for line in text.splitlines():
+        for label, pat in _CRITICAL:
+            if pat.search(line):
+                events.append((label, line.strip()))
+                break
+    return events
+
+
 def available() -> bool:
     return bool(shutil.which("journalctl") or shutil.which("coredumpctl")
                 or shutil.which("nvidia-smi") or _xorg_log())
diff --git a/src/rigdoctor/gui/setup_page.py b/src/rigdoctor/gui/setup_page.py
index d67d414..28f0811 100644
--- a/src/rigdoctor/gui/setup_page.py
+++ b/src/rigdoctor/gui/setup_page.py
@@ -114,7 +114,8 @@ class SetupPage(QWidget):
         grid.addWidget(QLabel("CPU temperature alert"), 1, 0)
         grid.addWidget(self._cpu_alert, 1, 1)
         alerts_layout.addLayout(grid)
-        alerts_note = QLabel("GPU-lost and new-version alerts are included whenever notifications are enabled.")
+        alerts_note = QLabel("GPU-lost, critical kernel events (Xid, out-of-memory, disk I/O, PCIe), "
+                             "and new-version alerts are included whenever notifications are enabled.")
         alerts_note.setObjectName("Muted")
         alerts_note.setWordWrap(True)
         alerts_layout.addWidget(alerts_note)
diff --git a/tests/test_alerts.py b/tests/test_alerts.py
index f8af1b3..5890456 100644
--- a/tests/test_alerts.py
+++ b/tests/test_alerts.py
@@ -34,5 +34,35 @@ class AlertTests(unittest.TestCase):
         m.assert_called_once()
 
 
+class KernelEventAlertTests(unittest.TestCase):
+    @mock.patch.object(alerts, "notify")
+    def test_kernel_event_fires_once_within_cooldown(self, m):
+        mon = alerts.AlertMonitor(cooldown=300.0, event_interval=0.0)
+        mon._last_kernel_scan = 0.0  # force a scan
+        with mock.patch("rigdoctor.core.syslogs.kernel_log",
+                        return_value="NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus"):
+            mon._scan_kernel_events()
+            mon._last_kernel_scan = 0.0  # force another scan — cooldown must suppress it
+            mon._scan_kernel_events()
+        self.assertEqual(m.call_count, 1)
+        self.assertIn("Xid", m.call_args[0][0])
+
+    @mock.patch.object(alerts, "notify")
+    def test_no_alert_when_kernel_log_empty(self, m):
+        mon = alerts.AlertMonitor(event_interval=0.0)
+        mon._last_kernel_scan = 0.0
+        with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value=""):
+            mon._scan_kernel_events()
+        m.assert_not_called()
+
+    @mock.patch.object(alerts, "notify")
+    def test_scan_gated_by_interval(self, m):
+        mon = alerts.AlertMonitor(event_interval=9999.0)  # just constructed → not due yet
+        with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value="NVRM: Xid 79") as kl:
+            mon._scan_kernel_events()
+        kl.assert_not_called()
+        m.assert_not_called()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_syslogs.py b/tests/test_syslogs.py
index 24ce0be..9e3041d 100644
--- a/tests/test_syslogs.py
+++ b/tests/test_syslogs.py
@@ -72,6 +72,25 @@ class DisplayTests(unittest.TestCase):
         self.assertTrue(any(a.startswith("_COMM=") for a in cmd))
 
 
+class ScanCriticalTests(unittest.TestCase):
+    def test_matches_each_category(self):
+        text = "\n".join([
+            "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus",
+            "Out of memory: Killed process 1234 (PathOfExile)",
+            "mce: [Hardware Error]: CPU 0",
+            "pcieport 0000:00:01.0: AER: Corrected error received",
+            "blk_update_request: I/O error, dev sda, sector 99",
+            "this is a perfectly normal line",
+        ])
+        labels = {label for label, _ in syslogs.scan_critical(text)}
+        self.assertEqual(labels, {
+            "GPU error (Xid)", "Out of memory", "CPU machine-check",
+            "PCIe error", "Disk I/O error"})
+
+    def test_clean_log_no_events(self):
+        self.assertEqual(syslogs.scan_critical("usb 1-2: new high-speed device\nsystemd: started"), [])
+
+
 class CollectTests(unittest.TestCase):
     def test_collect_combines_sections(self):
         with mock.patch.object(syslogs, "kernel_log", return_value="NVRM: Xid 79"), \
-- 
2.52.0


From 2989e8e23e5b08d09e3a5daa5bfeda94976c9bad Mon Sep 17 00:00:00 2001
From: Jessey van Offeren <jjvanofferen@gmail.com>
Date: Fri, 22 May 2026 14:42:41 +0200
Subject: [PATCH 2/2] ci: run tests.yml on pull_request only (no push) to avoid
 double runs

A branch with an open PR triggered both the push and pull_request events, running
every job twice. Trigger on pull_request only; pushes to main are already tested
by release.yml's `test` job. No version bump (CI config only).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitea/workflows/tests.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.gitea/workflows/tests.yml b/.gitea/workflows/tests.yml
index 0960d07..bc8301b 100644
--- a/.gitea/workflows/tests.yml
+++ b/.gitea/workflows/tests.yml
@@ -1,14 +1,15 @@
 name: tests
 run-name: Run test suite
 
-# Runs the unittest suite on every push and pull request. Two jobs:
+# Runs the unittest suite on pull requests (once per PR). Pushes to main are covered by the
+# `test` job in release.yml, so we don't trigger on push here — that would double every run.
+# Two jobs:
 #   core      — stdlib-only install; the GUI tests skip (@skipUnless HAVE_QT). Bulletproof.
 #   gui-smoke — installs the GUI extra + offscreen Qt libs and runs the same suite headless,
 #               exercising the MainWindow/SetupWizard/DiagnosticDialog construction tests.
-# Make `core` a required status check on `main` so a PR can't merge with failing tests.
+# Make `tests / core (pull_request)` a required status check on `main` so a PR can't merge red.
 
 on:
-  push:
   pull_request:
 
 jobs:
-- 
2.52.0