feat(health): detect no-Xid GPU freezes (open-module VA-space faults)
The kernel-log scanner only caught Xid codes, OOM, panic, MCE, AER, thermal, and amdgpu resets — so a hard freeze that logs NO Xid slipped through entirely.

Add detection for the NVIDIA open-kernel-module VA-space mapping fault (gpu_vaspace.c / dmaAllocMapping / NVKMS GEM-allocation failures), which can storm for minutes and end in a freeze without the GPU ever "falling off the bus".

Also flag when the open kernel module (nvidia-*-open) is loaded — the context behind these faults — and add an AI-knowledge entry so the assistant distinguishes it from the Xid 79 hardware drop.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -11,11 +11,19 @@ from rigdoctor.core.health import (
|
||||
WARNING,
|
||||
check_displays,
|
||||
check_memory_speed,
|
||||
check_nvidia_module,
|
||||
check_pcie_links,
|
||||
run_health_checks,
|
||||
scan_journal_text,
|
||||
)
|
||||
|
||||
# A real no-Xid freeze: the open-module VA-space storm captured on 2026-05-29.
# Fixture passed to scan_journal_text(). It holds three kernel-log lines: the
# gpu_vaspace.c check failure, the dmaAllocMapping VA-space update failure, and
# the nvidia_drm NVKMS GEM-allocation error. None of them contains an Xid code.
|
||||
_VASPACE_LOG = """\
|
||||
NVRM: nvCheckFailedNoLog: Check failed: 0 == (pMapNode->gpuMask & gpuMask) @ gpu_vaspace.c:4547
|
||||
NVRM: dmaAllocMapping_GM107: can't update VA space for mapping @vaddr=0x4be00000
|
||||
[drm:nv_drm_gem_alloc_nvkms_memory_ioctl [nvidia_drm]] *ERROR* Failed to allocate NVKMS memory for GEM object
|
||||
"""
|
||||
|
||||
|
||||
class HealthScanTests(unittest.TestCase):
|
||||
def test_xid_79_is_critical(self):
|
||||
@@ -44,6 +52,28 @@ class HealthScanTests(unittest.TestCase):
|
||||
def test_clean_text_yields_no_findings(self):
    """Scanning log text with only routine USB/Bluetooth lines returns an empty list."""
    benign_log = "usb 1-1: new high-speed USB device\nbluetooth: ok"
    self.assertEqual(scan_journal_text(benign_log), [])
|
||||
|
||||
def test_vaspace_freeze_detected_without_any_xid(self):
    """The VA-space storm fixture yields exactly one GPU finding, at WARNING severity.

    The finding's title must mention "VA-space" and must not mention "Xid";
    its detail must mention the open kernel module (case-insensitive).
    """
    gpu_findings = [
        finding
        for finding in scan_journal_text(_VASPACE_LOG)
        if finding.category == "GPU"
    ]
    self.assertEqual(len(gpu_findings), 1)

    vaspace = gpu_findings[0]
    self.assertEqual(vaspace.severity, WARNING)
    self.assertIn("VA-space", vaspace.title)
    # The fixture has no Xid line, so the finding must not be labelled as one.
    self.assertNotIn("Xid", vaspace.title)
    self.assertIn("open kernel module", vaspace.detail.lower())
|
||||
|
||||
def test_open_module_finding_when_open_loaded(self):
    """With the open-module probe patched to True, one INFO "Driver" finding is returned."""
    open_probe = mock.patch(
        "rigdoctor.core.health._nvidia_module_is_open", return_value=True
    )
    with open_probe:
        reported = check_nvidia_module()

    self.assertEqual(len(reported), 1)
    finding = reported[0]
    self.assertEqual(finding.severity, INFO)
    self.assertEqual(finding.category, "Driver")
|
||||
|
||||
def test_no_module_finding_when_proprietary_or_absent(self):
    """check_nvidia_module() returns no findings when the probe reports False or None.

    Presumably False means the proprietary module is loaded and None means no
    NVIDIA module is present (inferred from the test name; confirm against
    _nvidia_module_is_open).

    Each probe value runs in its own subTest, so a failure names the value
    that broke and the other value is still exercised.
    """
    for state in (False, None):
        with self.subTest(state=state), mock.patch(
            "rigdoctor.core.health._nvidia_module_is_open", return_value=state
        ):
            self.assertEqual(check_nvidia_module(), [])
|
||||
|
||||
def test_run_health_checks_returns_findings(self):
|
||||
# Runs against the real system; just assert it returns a sorted list of Findings.
|
||||
findings = run_health_checks()
|
||||
|
||||
Reference in New Issue
Block a user