feat(m8): event-based alerts — Xid/OOM/MCE/PCIe/disk from the kernel log — 0.34.0
AlertMonitor now scans the kernel log (journalctl -k) roughly every 30 s and fires one-shot, cooldown-gated desktop alerts on critical events: NVIDIA Xid errors, OOM kills, CPU machine-check exceptions, PCIe AER errors, and disk I/O errors — so users are warned the moment something goes wrong, not only when a temperature threshold is crossed. Disk I/O errors are read from the kernel log (no root needed, unlike smartctl). Edge/spam protection reuses the existing cooldown model. syslogs.scan_critical() does the matching; AlertMonitor's initializer seeds the last-scan time to "now" so that old boot-log entries don't trigger alerts on launch. Adds tests for the matcher and for the monitor's interval gating and cooldown; Settings note updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -34,5 +34,35 @@ class AlertTests(unittest.TestCase):
|
||||
m.assert_called_once()
|
||||
|
||||
|
||||
class KernelEventAlertTests(unittest.TestCase):
    """Exercise ``AlertMonitor._scan_kernel_events`` against a patched kernel log."""

    # Dotted path of the kernel-log reader that every test here replaces.
    _KERNEL_LOG = "rigdoctor.core.syslogs.kernel_log"
    # Sample kernel-log line used to trigger a GPU (Xid) alert.
    _XID_LINE = "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus"

    @mock.patch.object(alerts, "notify")
    def test_kernel_event_fires_once_within_cooldown(self, m):
        """Two forced scans of the same Xid line produce a single notification."""
        monitor = alerts.AlertMonitor(cooldown=300.0, event_interval=0.0)
        with mock.patch(self._KERNEL_LOG, return_value=self._XID_LINE):
            for _ in range(2):
                # Reset the timestamp so the scan is due each time; on the
                # second pass the cooldown must suppress the repeat alert.
                monitor._last_kernel_scan = 0.0
                monitor._scan_kernel_events()
        self.assertEqual(m.call_count, 1)
        first_positional = m.call_args[0][0]
        self.assertIn("Xid", first_positional)

    @mock.patch.object(alerts, "notify")
    def test_no_alert_when_kernel_log_empty(self, m):
        """An empty kernel log must not trigger any notification."""
        monitor = alerts.AlertMonitor(event_interval=0.0)
        monitor._last_kernel_scan = 0.0  # make the scan due immediately
        with mock.patch(self._KERNEL_LOG, return_value=""):
            monitor._scan_kernel_events()
        m.assert_not_called()

    @mock.patch.object(alerts, "notify")
    def test_scan_gated_by_interval(self, m):
        """A freshly constructed monitor with a long interval skips the scan."""
        # Just constructed, so with a 9999.0 interval the scan is not due yet.
        monitor = alerts.AlertMonitor(event_interval=9999.0)
        with mock.patch(self._KERNEL_LOG, return_value="NVRM: Xid 79") as kernel_log_mock:
            monitor._scan_kernel_events()
        # Neither the log reader nor the notifier may have been touched.
        kernel_log_mock.assert_not_called()
        m.assert_not_called()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -72,6 +72,25 @@ class DisplayTests(unittest.TestCase):
|
||||
self.assertTrue(any(a.startswith("_COMM=") for a in cmd))
|
||||
|
||||
|
||||
class ScanCriticalTests(unittest.TestCase):
    """Checks for the labels reported by ``syslogs.scan_critical``."""

    def test_matches_each_category(self):
        """One sample line per category yields exactly the five expected labels."""
        sample_lines = (
            "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus",
            "Out of memory: Killed process 1234 (PathOfExile)",
            "mce: [Hardware Error]: CPU 0",
            "pcieport 0000:00:01.0: AER: Corrected error received",
            "blk_update_request: I/O error, dev sda, sector 99",
            # Benign line: must not contribute a label.
            "this is a perfectly normal line",
        )
        expected_labels = {
            "GPU error (Xid)",
            "Out of memory",
            "CPU machine-check",
            "PCIe error",
            "Disk I/O error",
        }

        found_labels = set()
        for label, _ in syslogs.scan_critical("\n".join(sample_lines)):
            found_labels.add(label)

        self.assertEqual(found_labels, expected_labels)

    def test_clean_log_no_events(self):
        """A log containing only routine messages produces an empty result."""
        benign_log = "usb 1-2: new high-speed device\nsystemd: started"
        events = syslogs.scan_critical(benign_log)
        self.assertEqual(events, [])
|
||||
|
||||
|
||||
class CollectTests(unittest.TestCase):
|
||||
def test_collect_combines_sections(self):
|
||||
with mock.patch.object(syslogs, "kernel_log", return_value="NVRM: Xid 79"), \
|
||||
|
||||
Reference in New Issue
Block a user