b1bc961b79
The kernel-log scanner only caught Xid codes, OOM, panic, MCE, AER, thermal, and amdgpu resets — so a hard freeze that logs NO Xid slipped through entirely. Add detection for the NVIDIA open-kernel-module VA-space mapping fault (gpu_vaspace.c / dmaAllocMapping / NVKMS GEM-allocation failures), which can storm for minutes and end in a freeze without the GPU ever "falling off the bus". Also flag when the open kernel module (nvidia-*-open) is loaded — the context behind these faults — and add an AI-knowledge entry so the assistant distinguishes it from the Xid 79 hardware drop. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
154 lines
6.5 KiB
Python
154 lines
6.5 KiB
Python
"""Tests for the M4 health report's log scanner (synthetic input)."""
|
||
|
||
import unittest
|
||
from pathlib import Path
|
||
from unittest import mock
|
||
|
||
from rigdoctor.core import displays, health
|
||
from rigdoctor.core.health import (
|
||
CRITICAL,
|
||
INFO,
|
||
WARNING,
|
||
check_displays,
|
||
check_memory_speed,
|
||
check_nvidia_module,
|
||
check_pcie_links,
|
||
run_health_checks,
|
||
scan_journal_text,
|
||
)
|
||
|
||
# A real no-Xid freeze: the open-module VA-space storm captured on 2026-05-29.
|
||
_VASPACE_LOG = """\
|
||
NVRM: nvCheckFailedNoLog: Check failed: 0 == (pMapNode->gpuMask & gpuMask) @ gpu_vaspace.c:4547
|
||
NVRM: dmaAllocMapping_GM107: can't update VA space for mapping @vaddr=0x4be00000
|
||
[drm:nv_drm_gem_alloc_nvkms_memory_ioctl [nvidia_drm]] *ERROR* Failed to allocate NVKMS memory for GEM object
|
||
"""
|
||
|
||
|
||
class HealthScanTests(unittest.TestCase):
    """Tests for scan_journal_text(), check_nvidia_module() and run_health_checks().

    The scanner tests feed synthetic kernel-log text; the module tests patch
    ``rigdoctor.core.health._nvidia_module_is_open`` so no real driver is probed.
    """

    def test_xid_79_is_critical(self):
        # One Xid 79 line -> exactly one GPU finding, titled with the code, CRITICAL.
        text = "NVRM: Xid (PCI:0000:01:00): 79, pid=1234, GPU has fallen off the bus."
        findings = scan_journal_text(text)
        gpu = [f for f in findings if f.category == "GPU"]
        self.assertEqual(len(gpu), 1)
        self.assertIn("79", gpu[0].title)
        self.assertEqual(gpu[0].severity, CRITICAL)

    def test_xid_count_aggregates(self):
        # Three identical Xid lines should be reported with a "×3" count in the title.
        text = "\n".join(["NVRM: Xid (PCI:0000:01:00): 79, foo"] * 3)
        gpu = [f for f in scan_journal_text(text) if f.category == "GPU"]
        # Fail with a readable assertion (not an IndexError) if nothing was found.
        self.assertTrue(gpu, "expected at least one GPU finding for repeated Xid lines")
        self.assertIn("×3", gpu[0].title)

    def test_oom_and_panic_detected(self):
        # An OOM kill and a kernel panic in the same text surface as two categories.
        text = "Out of memory: Killed process 999 (game)\nKernel panic - not syncing: x"
        cats = {f.category for f in scan_journal_text(text)}
        self.assertIn("Memory", cats)
        self.assertIn("Kernel", cats)

    def test_mce_critical(self):
        # A machine-check line must produce a CRITICAL finding in the Hardware category.
        findings = scan_journal_text("mce: [Hardware Error]: Machine check events logged")
        self.assertTrue(any(f.severity == CRITICAL and f.category == "Hardware" for f in findings))

    def test_clean_text_yields_no_findings(self):
        # Benign log lines must not produce any finding at all.
        self.assertEqual(scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), [])

    def test_vaspace_freeze_detected_without_any_xid(self):
        # The VA-space fixture contains no Xid, yet must still yield one GPU WARNING.
        findings = scan_journal_text(_VASPACE_LOG)
        gpu = [f for f in findings if f.category == "GPU"]
        self.assertEqual(len(gpu), 1)
        self.assertEqual(gpu[0].severity, WARNING)
        self.assertIn("VA-space", gpu[0].title)
        # It must NOT be misreported as an Xid finding (the log has no Xid at all).
        self.assertNotIn("Xid", gpu[0].title)
        self.assertIn("open kernel module", gpu[0].detail.lower())

    def test_open_module_finding_when_open_loaded(self):
        # With the open-module probe forced to True, one INFO "Driver" finding is expected.
        with mock.patch("rigdoctor.core.health._nvidia_module_is_open", return_value=True):
            findings = check_nvidia_module()
        self.assertEqual(len(findings), 1)
        self.assertEqual(findings[0].severity, INFO)
        self.assertEqual(findings[0].category, "Driver")

    def test_no_module_finding_when_proprietary_or_absent(self):
        # Probe results False and None must both produce no finding.
        # subTest labels a failure with the offending state and still runs the other one.
        for state in (False, None):
            with self.subTest(state=state):
                with mock.patch("rigdoctor.core.health._nvidia_module_is_open", return_value=state):
                    self.assertEqual(check_nvidia_module(), [])

    def test_run_health_checks_returns_findings(self):
        # Runs against the real system; just assert it returns a sorted list of Findings.
        findings = run_health_checks()
        self.assertIsInstance(findings, list)
        severities = [f.severity for f in findings]
        # Expected ordering, most severe first; unknown severities rank last (9).
        order = {"critical": 0, "warning": 1, "info": 2, "ok": 3}
        ranks = [order.get(s, 9) for s in severities]
        self.assertEqual(ranks, sorted(ranks))
|
||
|
||
|
||
class PcieLinkCheckTests(unittest.TestCase):
    """Tests for check_pcie_links() against one faked NVMe controller."""

    def _with_link(self, cur_g, cur_w, max_g, max_w):
        """Return a pair of un-started patchers for the inventory helpers.

        The first makes ``nvme_controllers`` report a single fake controller;
        the second makes ``read_link`` return the given
        ``(cur_g, cur_w, max_g, max_w)`` tuple.
        """
        controllers_patch = mock.patch(
            "rigdoctor.core.inventory.nvme_controllers",
            return_value=[("nvme0", Path("/x"))],
        )
        link_patch = mock.patch(
            "rigdoctor.core.inventory.read_link",
            return_value=(cur_g, cur_w, max_g, max_w),
        )
        return controllers_patch, link_patch

    def test_reduced_width_is_a_warning_about_lane_sharing(self):
        # Gen4 x2 on a link that supports x4.
        controllers_patch, link_patch = self._with_link(4, "2", 4, "4")
        with controllers_patch:
            with link_patch:
                results = check_pcie_links()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, WARNING)
        self.assertIn("lane-sharing", finding.detail)

    def test_reduced_speed_only_is_info(self):
        # Gen3 x4 on a link that supports Gen4: full width, lower generation.
        controllers_patch, link_patch = self._with_link(3, "4", 4, "4")
        with controllers_patch:
            with link_patch:
                results = check_pcie_links()
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].severity, INFO)

    def test_full_speed_no_finding(self):
        # Current link equals the maximum on both generation and width.
        controllers_patch, link_patch = self._with_link(4, "4", 4, "4")
        with controllers_patch:
            with link_patch:
                self.assertEqual(check_pcie_links(), [])
|
||
|
||
|
||
class DisplayCheckTests(unittest.TestCase):
    """Tests for check_displays() with ``displays.collect`` patched to one monitor."""

    def test_lower_than_max_refresh_is_flagged(self):
        # Last two Monitor arguments are 60.0 and 165.0 -- presumably the current
        # and maximum refresh rates (TODO confirm against displays.Monitor).
        monitor = displays.Monitor("DP-1", "Samsung LC34G55T", 3440, 1440, 60.0, 165.0)
        collect_patch = mock.patch("rigdoctor.core.displays.collect", return_value=[monitor])
        with collect_patch:
            results = check_displays()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, INFO)
        self.assertIn("165", finding.title)

    def test_at_max_refresh_no_finding(self):
        # Same monitor, but both trailing values are 165.0: nothing to report.
        monitor = displays.Monitor("DP-1", "Samsung LC34G55T", 3440, 1440, 165.0, 165.0)
        collect_patch = mock.patch("rigdoctor.core.displays.collect", return_value=[monitor])
        with collect_patch:
            self.assertEqual(check_displays(), [])
|
||
|
||
|
||
class MemorySpeedCheckTests(unittest.TestCase):
    """Tests for check_memory_speed() with the dmidecode lookup patched out."""

    def _dmi(self, configured, part):
        """Build a one-module dmidecode-style dict.

        ``configured`` fills both the "Configured Memory Speed" and "Speed"
        fields; ``part`` fills "Part Number".
        """
        module = {
            "Configured Memory Speed": configured,
            "Speed": configured,
            "Part Number": part,
        }
        return {"memory": [module]}

    def test_flags_unapplied_expo(self):
        # Module configured at 4800 MT/s while the part number contains "5600".
        dmi = self._dmi("4800 MT/s", "CMK32GX5M2B5600Z36")
        # `privileged` is stubbed to return None -- presumably so the test never
        # attempts real privilege elevation (verify against rigdoctor.core.elevation).
        with mock.patch("rigdoctor.core.elevation.privileged", return_value=None):
            with mock.patch("rigdoctor.core.inventory._dmidecode", return_value=dmi):
                results = check_memory_speed()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, INFO)
        self.assertIn("5600", finding.title)

    def test_no_flag_at_rated(self):
        # Configured speed already matches the "5600" in the part number.
        dmi = self._dmi("5600 MT/s", "CMK32GX5M2B5600Z36")
        with mock.patch("rigdoctor.core.elevation.privileged", return_value=None):
            with mock.patch("rigdoctor.core.inventory._dmidecode", return_value=dmi):
                self.assertEqual(check_memory_speed(), [])
|
||
|
||
|
||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|