2ee7763d00
AlertMonitor now scans the kernel log (journalctl -k) every ~30s and fires one-shot, cooldown-gated desktop alerts on critical events: NVIDIA Xid, OOM kills, CPU machine-checks, PCIe AER, and disk I/O errors — so users are warned the moment something goes wrong, not only on a temperature threshold. Disk I/O errors come from the kernel log (no root needed, unlike smartctl). Edge/spam protection reuses the existing cooldown model. syslogs.scan_critical() does the matching; init seeds last-scan to "now" so old boot logs don't alert on launch. Tests for the matcher + monitor gating/cooldown; Settings note updated. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
69 lines
2.6 KiB
Python
69 lines
2.6 KiB
Python
"""Tests for the M8 alert monitor (edge-triggered; notify mocked)."""
|
|
|
|
import unittest
|
|
from unittest import mock
|
|
|
|
from rigdoctor.core import alerts
|
|
from rigdoctor.core.sample import Reading, Sample
|
|
|
|
|
|
def _gpu(temp):
    """Build a ``Sample`` containing a single GPU temperature reading.

    *temp* is passed straight through as the reading's value, with "°C" as
    its unit string.
    """
    gpu_temp_reading = Reading("gpu", "temp", temp, "°C")
    return Sample(readings=[gpu_temp_reading])
|
|
|
|
|
|
class AlertTests(unittest.TestCase):
    """Alerts raised through ``AlertMonitor.check`` with ``alerts.notify`` mocked."""

    @mock.patch.object(alerts, "notify")
    def test_edge_triggered_no_repeat(self, notify_mock):
        """A hot GPU notifies once per excursion and again after it clears."""
        monitor = alerts.AlertMonitor(gpu_temp=90.0, cooldown=0.0)

        # First excursion: 95 fires; 96 is still hot, so no repeat while active.
        for hot_temp in (95, 96):
            monitor.check(_gpu(hot_temp))
        self.assertEqual(notify_mock.call_count, 1)

        # 50 clears the condition; going back to 95 fires a second time.
        for next_temp in (50, 95):
            monitor.check(_gpu(next_temp))
        self.assertEqual(notify_mock.call_count, 2)

    @mock.patch.object(alerts, "notify")
    def test_no_alert_below_threshold(self, notify_mock):
        """A reading under the configured GPU threshold never notifies."""
        monitor = alerts.AlertMonitor(gpu_temp=90.0)
        monitor.check(_gpu(70))
        notify_mock.assert_not_called()

    @mock.patch.object(alerts, "notify")
    def test_gpu_lost(self, notify_mock):
        """A valueless GPU status reading flagged "query-timeout" notifies once."""
        monitor = alerts.AlertMonitor()
        # Fifth positional argument is presumably the reading's error/note
        # field -- confirm against ``Reading``'s definition.
        lost_gpu = Reading("gpu", "status", None, "", "query-timeout")
        monitor.check(Sample(readings=[lost_gpu]))
        notify_mock.assert_called_once()
|
|
|
|
|
|
class KernelEventAlertTests(unittest.TestCase):
    """Kernel-log alerts driven directly through ``_scan_kernel_events``.

    ``rigdoctor.core.syslogs.kernel_log`` is patched in every test so no real
    kernel log is read; ``alerts.notify`` is mocked to count notifications.
    """

    @mock.patch.object(alerts, "notify")
    def test_kernel_event_fires_once_within_cooldown(self, notify_mock):
        """Two forced scans of the same Xid line notify only once."""
        xid_line = "NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus"
        monitor = alerts.AlertMonitor(cooldown=300.0, event_interval=0.0)

        with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value=xid_line):
            # Resetting the last-scan stamp forces each scan to run; the
            # second pass must be suppressed by the 300 s cooldown.
            for _ in range(2):
                monitor._last_kernel_scan = 0.0
                monitor._scan_kernel_events()

        self.assertEqual(notify_mock.call_count, 1)
        first_positional_arg = notify_mock.call_args[0][0]
        self.assertIn("Xid", first_positional_arg)

    @mock.patch.object(alerts, "notify")
    def test_no_alert_when_kernel_log_empty(self, notify_mock):
        """An empty kernel log produces no notification."""
        monitor = alerts.AlertMonitor(event_interval=0.0)
        monitor._last_kernel_scan = 0.0  # force the scan to run

        with mock.patch("rigdoctor.core.syslogs.kernel_log", return_value=""):
            monitor._scan_kernel_events()

        notify_mock.assert_not_called()

    @mock.patch.object(alerts, "notify")
    def test_scan_gated_by_interval(self, notify_mock):
        """A freshly built monitor with a huge interval does not read the log."""
        # Just constructed, so with a 9999 s interval the scan is not due yet.
        monitor = alerts.AlertMonitor(event_interval=9999.0)
        kernel_log_target = "rigdoctor.core.syslogs.kernel_log"

        with mock.patch(kernel_log_target, return_value="NVRM: Xid 79") as kernel_log_mock:
            monitor._scan_kernel_events()
            kernel_log_mock.assert_not_called()

        notify_mock.assert_not_called()
|
|
|
|
|
|
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|