Files
jessey b1bc961b79 feat(health): detect no-Xid GPU freezes (open-module VA-space faults)
The kernel-log scanner only caught Xid codes, OOM, panic, MCE, AER, thermal,
and amdgpu resets — so a hard freeze that logs NO Xid slipped through entirely.
Add detection for the NVIDIA open-kernel-module VA-space mapping fault
(gpu_vaspace.c / dmaAllocMapping / NVKMS GEM-allocation failures), which can
storm for minutes and end in a freeze without the GPU ever "falling off the
bus". Also flag when the open kernel module (nvidia-*-open) is loaded — the
context behind these faults — and add an AI-knowledge entry so the assistant
distinguishes it from the Xid 79 hardware drop.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 16:07:14 +02:00

154 lines
6.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
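"""Tests for the M4 health report.

Covers the kernel-log scanner (fed synthetic log text) and the NVIDIA-module,
PCIe-link, display and memory-speed checks (with their probes mocked), plus
one smoke test that runs the full report against the real system.
"""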
import unittest
from pathlib import Path
from unittest import mock
from rigdoctor.core import displays, health
from rigdoctor.core.health import (
CRITICAL,
INFO,
WARNING,
check_displays,
check_memory_speed,
check_nvidia_module,
check_pcie_links,
run_health_checks,
scan_journal_text,
)
# Kernel-log excerpt of a real freeze that logged no Xid: the open-module
# VA-space storm captured on 2026-05-29. Three lines, each newline-terminated:
# the gpu_vaspace.c check failure, the dmaAllocMapping failure and the NVKMS
# GEM-allocation error.
_VASPACE_LOG = (
    "NVRM: nvCheckFailedNoLog: Check failed: 0 == (pMapNode->gpuMask & gpuMask) @ gpu_vaspace.c:4547\n"
    "NVRM: dmaAllocMapping_GM107: can't update VA space for mapping @vaddr=0x4be00000\n"
    "[drm:nv_drm_gem_alloc_nvkms_memory_ioctl [nvidia_drm]] *ERROR* Failed to allocate NVKMS memory for GEM object\n"
)
class HealthScanTests(unittest.TestCase):
    """Tests for scan_journal_text(), check_nvidia_module() and run_health_checks()."""

    def test_xid_79_is_critical(self):
        """A single Xid 79 line yields exactly one critical GPU finding."""
        text = "NVRM: Xid (PCI:0000:01:00): 79, pid=1234, GPU has fallen off the bus."
        findings = scan_journal_text(text)
        gpu = [f for f in findings if f.category == "GPU"]
        self.assertEqual(len(gpu), 1)
        self.assertIn("79", gpu[0].title)
        self.assertEqual(gpu[0].severity, CRITICAL)

    def test_xid_count_aggregates(self):
        """Three identical Xid lines show up as a "×3" count in the title."""
        text = "\n".join(["NVRM: Xid (PCI:0000:01:00): 79, foo"] * 3)
        gpu = [f for f in scan_journal_text(text) if f.category == "GPU"]
        # Fail with a readable assertion instead of an IndexError if the
        # scanner stops reporting the Xid at all.
        self.assertTrue(gpu, "expected a GPU finding for the repeated Xid lines")
        self.assertIn("×3", gpu[0].title)

    def test_oom_and_panic_detected(self):
        """OOM-killer and kernel-panic lines map to Memory and Kernel findings."""
        text = "Out of memory: Killed process 999 (game)\nKernel panic - not syncing: x"
        cats = {f.category for f in scan_journal_text(text)}
        self.assertIn("Memory", cats)
        self.assertIn("Kernel", cats)

    def test_mce_critical(self):
        """A machine-check line produces a critical Hardware finding."""
        findings = scan_journal_text("mce: [Hardware Error]: Machine check events logged")
        self.assertTrue(
            any(f.severity == CRITICAL and f.category == "Hardware" for f in findings)
        )

    def test_clean_text_yields_no_findings(self):
        """Unremarkable log lines produce an empty findings list."""
        self.assertEqual(
            scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), []
        )

    def test_vaspace_freeze_detected_without_any_xid(self):
        """The VA-space storm is one WARNING GPU finding, not an Xid finding."""
        findings = scan_journal_text(_VASPACE_LOG)
        gpu = [f for f in findings if f.category == "GPU"]
        self.assertEqual(len(gpu), 1)
        self.assertEqual(gpu[0].severity, WARNING)
        self.assertIn("VA-space", gpu[0].title)
        # It must NOT be misreported as an Xid finding (the log has no Xid at all).
        self.assertNotIn("Xid", gpu[0].title)
        self.assertIn("open kernel module", gpu[0].detail.lower())

    def test_open_module_finding_when_open_loaded(self):
        """With the open module reported as loaded, one INFO Driver finding is returned."""
        with mock.patch("rigdoctor.core.health._nvidia_module_is_open", return_value=True):
            findings = check_nvidia_module()
        self.assertEqual(len(findings), 1)
        self.assertEqual(findings[0].severity, INFO)
        self.assertEqual(findings[0].category, "Driver")

    def test_no_module_finding_when_proprietary_or_absent(self):
        """No finding when the open-module probe returns False or None.

        NOTE(review): presumably False means the proprietary module and None
        means no NVIDIA module at all -- confirm against _nvidia_module_is_open.
        """
        for state in (False, None):
            # subTest names the failing state and lets the other one still run.
            with self.subTest(state=state):
                with mock.patch(
                    "rigdoctor.core.health._nvidia_module_is_open", return_value=state
                ):
                    self.assertEqual(check_nvidia_module(), [])

    def test_run_health_checks_returns_findings(self):
        """Smoke test: the full report is a list ordered by severity rank."""
        # Runs against the real system; just assert it returns a sorted list of Findings.
        findings = run_health_checks()
        self.assertIsInstance(findings, list)
        severities = [f.severity for f in findings]
        order = {"critical": 0, "warning": 1, "info": 2, "ok": 3}
        # Unknown severities rank last (9) so they cannot mask a mis-ordering.
        ranks = [order.get(s, 9) for s in severities]
        self.assertEqual(ranks, sorted(ranks))
class PcieLinkCheckTests(unittest.TestCase):
    """Tests for check_pcie_links() against one fake NVMe controller."""

    def _with_link(self, cur_g, cur_w, max_g, max_w):
        """Return two not-yet-started patchers for the inventory probes.

        The first makes nvme_controllers() report a single fake controller;
        the second makes read_link() return the given
        (cur_g, cur_w, max_g, max_w) tuple.
        """
        controllers_patch = mock.patch(
            "rigdoctor.core.inventory.nvme_controllers",
            return_value=[("nvme0", Path("/x"))],
        )
        link_patch = mock.patch(
            "rigdoctor.core.inventory.read_link",
            return_value=(cur_g, cur_w, max_g, max_w),
        )
        return (controllers_patch, link_patch)

    def test_reduced_width_is_a_warning_about_lane_sharing(self):
        """Gen4 x2 on a link that supports x4 is a WARNING mentioning lane-sharing."""
        controllers_patch, link_patch = self._with_link(4, "2", 4, "4")
        with controllers_patch, link_patch:
            results = check_pcie_links()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, WARNING)
        self.assertIn("lane-sharing", finding.detail)

    def test_reduced_speed_only_is_info(self):
        """Gen3 x4 on a link that supports Gen4 is only an INFO finding."""
        controllers_patch, link_patch = self._with_link(3, "4", 4, "4")
        with controllers_patch, link_patch:
            results = check_pcie_links()
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].severity, INFO)

    def test_full_speed_no_finding(self):
        """A link running at its maximum generation and width yields nothing."""
        controllers_patch, link_patch = self._with_link(4, "4", 4, "4")
        with controllers_patch, link_patch:
            self.assertEqual(check_pcie_links(), [])
class DisplayCheckTests(unittest.TestCase):
    """Tests for check_displays() with displays.collect() mocked."""

    def test_lower_than_max_refresh_is_flagged(self):
        """A monitor at 60.0 with 165.0 available gets one INFO finding naming 165."""
        monitor = displays.Monitor("DP-1", "Samsung LC34G55T", 3440, 1440, 60.0, 165.0)
        collect_patch = mock.patch("rigdoctor.core.displays.collect", return_value=[monitor])
        with collect_patch:
            results = check_displays()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, INFO)
        self.assertIn("165", finding.title)

    def test_at_max_refresh_no_finding(self):
        """A monitor already at 165.0 of 165.0 produces no finding."""
        monitor = displays.Monitor("DP-1", "Samsung LC34G55T", 3440, 1440, 165.0, 165.0)
        collect_patch = mock.patch("rigdoctor.core.displays.collect", return_value=[monitor])
        with collect_patch:
            self.assertEqual(check_displays(), [])
class MemorySpeedCheckTests(unittest.TestCase):
    """Tests for check_memory_speed() with the dmidecode probe mocked."""

    def _dmi(self, configured, part):
        """Build a fake dmidecode result holding a single memory module.

        Both "Configured Memory Speed" and "Speed" are set to *configured*.
        """
        module = {
            "Configured Memory Speed": configured,
            "Speed": configured,
            "Part Number": part,
        }
        return {"memory": [module]}

    def test_flags_unapplied_expo(self):
        """A 5600-class part running at 4800 MT/s gets one INFO finding naming 5600.

        NOTE(review): presumably the rated speed is derived from the part
        number -- confirm against check_memory_speed.
        """
        dmi = self._dmi("4800 MT/s", "CMK32GX5M2B5600Z36")
        privileged_patch = mock.patch("rigdoctor.core.elevation.privileged", return_value=None)
        dmidecode_patch = mock.patch("rigdoctor.core.inventory._dmidecode", return_value=dmi)
        with privileged_patch, dmidecode_patch:
            results = check_memory_speed()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, INFO)
        self.assertIn("5600", finding.title)

    def test_no_flag_at_rated(self):
        """The same part configured at 5600 MT/s produces no finding."""
        dmi = self._dmi("5600 MT/s", "CMK32GX5M2B5600Z36")
        privileged_patch = mock.patch("rigdoctor.core.elevation.privileged", return_value=None)
        dmidecode_patch = mock.patch("rigdoctor.core.inventory._dmidecode", return_value=dmi)
        with privileged_patch, dmidecode_patch:
            self.assertEqual(check_memory_speed(), [])
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()