Files
rigdoctor/tests/test_alerts.py
T
jessey 2ee7763d00
tests / core (push) Successful in 12s
tests / gui-smoke (push) Successful in 27s
tests / core (pull_request) Successful in 12s
tests / gui-smoke (pull_request) Successful in 26s
feat(m8): event-based alerts — Xid/OOM/MCE/PCIe/disk from the kernel log — 0.34.0
AlertMonitor now scans the kernel log (journalctl -k) every ~30s and fires
one-shot, cooldown-gated desktop alerts on critical events: NVIDIA Xid, OOM
kills, CPU machine-checks, PCIe AER, and disk I/O errors — so users are warned
the moment something goes wrong, not only on a temperature threshold. Disk I/O
errors come from the kernel log (no root needed, unlike smartctl). Edge/spam
protection reuses the existing cooldown model. syslogs.scan_critical() does the
matching; init seeds last-scan to "now" so old boot logs don't alert on launch.
Tests for the matcher + monitor gating/cooldown; Settings note updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 14:41:13 +02:00

69 lines
2.6 KiB
Python

"""Tests for the M8 alert monitor (edge-triggered; notify mocked)."""
import unittest
from unittest import mock
from rigdoctor.core import alerts
from rigdoctor.core.sample import Reading, Sample
def _gpu(temp):
    """Return a one-reading Sample whose ("gpu", "temp") reading carries *temp* in °C."""
    gpu_temp_reading = Reading("gpu", "temp", temp, "°C")
    return Sample(readings=[gpu_temp_reading])
class AlertTests(unittest.TestCase):
    """Sample-driven alerts from ``AlertMonitor.check`` with ``alerts.notify`` patched."""

    @mock.patch.object(alerts, "notify")
    def test_edge_triggered_no_repeat(self, notify_mock):
        """A hot GPU notifies on the rising edge only, and again after clearing."""
        monitor = alerts.AlertMonitor(gpu_temp=90.0, cooldown=0.0)

        # First reading fires; the second is still hot, so no repeat while active.
        for still_hot in (95, 96):
            monitor.check(_gpu(still_hot))
        self.assertEqual(notify_mock.call_count, 1)

        # 50 clears the condition, so the following 95 fires a second time.
        for temp in (50, 95):
            monitor.check(_gpu(temp))
        self.assertEqual(notify_mock.call_count, 2)

    @mock.patch.object(alerts, "notify")
    def test_no_alert_below_threshold(self, notify_mock):
        """A reading of 70 against gpu_temp=90.0 produces no notification."""
        monitor = alerts.AlertMonitor(gpu_temp=90.0)
        monitor.check(_gpu(70))
        notify_mock.assert_not_called()

    @mock.patch.object(alerts, "notify")
    def test_gpu_lost(self, notify_mock):
        """A ("gpu", "status") reading flagged "query-timeout" notifies exactly once."""
        monitor = alerts.AlertMonitor()
        lost_gpu = Sample(
            readings=[Reading("gpu", "status", None, "", "query-timeout")],
        )
        monitor.check(lost_gpu)
        notify_mock.assert_called_once()
class KernelEventAlertTests(unittest.TestCase):
    """Kernel-log event alerts, with ``alerts.notify`` and the log reader patched."""

    # Dotted path of the kernel-log reader that each test replaces with a mock.
    _KERNEL_LOG_TARGET = "rigdoctor.core.syslogs.kernel_log"

    @mock.patch.object(alerts, "notify")
    def test_kernel_event_fires_once_within_cooldown(self, notify_mock):
        """Two forced scans of the same Xid line notify only once (cooldown=300)."""
        monitor = alerts.AlertMonitor(cooldown=300.0, event_interval=0.0)
        xid_line = "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus"

        with mock.patch(self._KERNEL_LOG_TARGET, return_value=xid_line):
            for _ in range(2):
                # Resetting the timestamp forces a scan; on the second pass
                # the cooldown must suppress the repeat notification.
                monitor._last_kernel_scan = 0.0
                monitor._scan_kernel_events()

        self.assertEqual(notify_mock.call_count, 1)
        first_positional_arg = notify_mock.call_args[0][0]
        self.assertIn("Xid", first_positional_arg)

    @mock.patch.object(alerts, "notify")
    def test_no_alert_when_kernel_log_empty(self, notify_mock):
        """A forced scan over an empty kernel log produces no notification."""
        monitor = alerts.AlertMonitor(event_interval=0.0)
        monitor._last_kernel_scan = 0.0
        empty_log = mock.patch(self._KERNEL_LOG_TARGET, return_value="")
        with empty_log:
            monitor._scan_kernel_events()
        notify_mock.assert_not_called()

    @mock.patch.object(alerts, "notify")
    def test_scan_gated_by_interval(self, notify_mock):
        """With a 9999 s interval a just-constructed monitor does not read the log."""
        # Just constructed, so the next scan is not due yet.
        monitor = alerts.AlertMonitor(event_interval=9999.0)
        with mock.patch(
            self._KERNEL_LOG_TARGET, return_value="NVRM: Xid 79"
        ) as kernel_log_mock:
            monitor._scan_kernel_events()
        kernel_log_mock.assert_not_called()
        notify_mock.assert_not_called()
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()