feat(health): detect no-Xid GPU freezes (open-module VA-space faults)
The kernel-log scanner only caught Xid codes, OOM, panic, MCE, AER, thermal, and amdgpu resets — so a hard freeze that logs NO Xid slipped through entirely. Add detection for the NVIDIA open-kernel-module VA-space mapping fault (gpu_vaspace.c / dmaAllocMapping / NVKMS GEM-allocation failures), which can storm for minutes and end in a freeze without the GPU ever "falling off the bus". Also flag when the open kernel module (nvidia-*-open) is loaded — the context behind these faults — and add an AI-knowledge entry so the assistant distinguishes it from the Xid 79 hardware drop. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,14 @@ ENTRIES: list[tuple[tuple[str, ...], str]] = [
|
|||||||
(("xid 8", "xid 62", "xid 63", "xid 64"),
|
(("xid 8", "xid 62", "xid 63", "xid 64"),
|
||||||
"These Xid codes commonly indicate VRAM/ECC or memory-training problems — suspect failing "
|
"These Xid codes commonly indicate VRAM/ECC or memory-training problems — suspect failing "
|
||||||
"VRAM or an unstable memory overclock."),
|
"VRAM or an unstable memory overclock."),
|
||||||
|
(("va-space mapping", "gpu_vaspace", "dmaallocmapping", "nvkms memory for gem",
|
||||||
|
"open kernel module", "nvidia open"),
|
||||||
|
"NVIDIA open-kernel-module VA-space mapping errors (gpu_vaspace.c / dmaAllocMapping / "
|
||||||
|
"'Failed to allocate NVKMS memory for GEM object') are a driver-internal fault on the open "
|
||||||
|
"module (nvidia-*-open). They can storm for minutes and end in a HARD FREEZE with NO Xid "
|
||||||
|
"logged — so the GPU never 'falls off the bus', and this is distinct from the Xid 79 "
|
||||||
|
"hardware drop. Fix path: switch from the open to the proprietary NVIDIA kernel module and "
|
||||||
|
"update to the latest driver branch."),
|
||||||
(("smart 197", "current_pending_sector", "pending sector"),
|
(("smart 197", "current_pending_sector", "pending sector"),
|
||||||
"SMART 197 (Current Pending Sector) > 0 = sectors the drive can't read and is waiting to "
|
"SMART 197 (Current Pending Sector) > 0 = sectors the drive can't read and is waiting to "
|
||||||
"reallocate — early sign of a failing disk. Back up now and run an extended self-test."),
|
"reallocate — early sign of a failing disk. Back up now and run an extended self-test."),
|
||||||
|
|||||||
@@ -116,6 +116,31 @@ def scan_journal_text(text: str) -> list[Finding]:
|
|||||||
"Check power/thermals/driver; capture a session with `rigdoctor record`.",
|
"Check power/thermals/driver; capture a session with `rigdoctor record`.",
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# NVIDIA open-kernel-module VA-space mapping faults: a driver-internal failure that can
|
||||||
|
# storm for minutes and end in a HARD FREEZE with NO Xid logged — the GPU never "falls off
|
||||||
|
# the bus", so the Xid scan above misses it entirely. These code paths live in the open
|
||||||
|
# kernel module (nvidia-*-open); the proprietary module doesn't hit them.
|
||||||
|
nvrm_va = [
|
||||||
|
ln for ln in lines
|
||||||
|
if "gpu_vaspace.c" in ln
|
||||||
|
or "_gvaspaceMappingInsert" in ln
|
||||||
|
or "dmaAllocMapping" in ln
|
||||||
|
or "NVKMS memory for GEM object" in ln
|
||||||
|
]
|
||||||
|
if nvrm_va:
|
||||||
|
findings.append(Finding(
|
||||||
|
WARNING, "GPU", f"NVIDIA driver VA-space mapping errors ×{len(nvrm_va)}",
|
||||||
|
"The NVIDIA kernel module repeatedly failed to update the GPU's virtual address "
|
||||||
|
"space (gpu_vaspace / dmaAllocMapping assertions, NVKMS GEM-allocation failures). "
|
||||||
|
"This is a driver-internal fault that can recur for minutes and end in a hard freeze "
|
||||||
|
"with NO Xid logged — distinct from an Xid 79 hardware drop. These code paths are "
|
||||||
|
"specific to the open kernel module (nvidia-*-open).",
|
||||||
|
"If you're on the open module, switch to the proprietary NVIDIA driver "
|
||||||
|
"(install `nvidia-driver-###` instead of the `…-open` variant) and update to the "
|
||||||
|
"latest branch, then reboot. Capture a session with `rigdoctor record` to confirm "
|
||||||
|
"the errors precede the freeze.",
|
||||||
|
))
|
||||||
|
|
||||||
return findings
|
return findings
|
||||||
|
|
||||||
|
|
||||||
@@ -188,6 +213,53 @@ def check_nvidia_driver() -> list[Finding]:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text(path: str) -> str | None:
|
||||||
|
try:
|
||||||
|
return Path(path).read_text()
|
||||||
|
except OSError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _nvidia_module_is_open() -> bool | None:
    """Whether the *loaded* NVIDIA kernel module is the open-source flavor.

    Returns:
        True for the open module (nvidia-*-open), False for the proprietary module, and
        None when it can't be determined or no NVIDIA module is loaded.

    /proc/driver/nvidia/version is consulted first: it describes the loaded module and needs
    no external tool. ``modinfo nvidia`` is the fallback, but it describes the module file on
    disk whether or not it is loaded, so it is only trusted when /sys/module/nvidia exists.
    """
    banner = _read_text("/proc/driver/nvidia/version")
    if banner:
        low = banner.lower()
        if "open kernel module" in low:
            return True
        if "kernel module" in low:  # proprietary banner: "NVIDIA UNIX … Kernel Module …"
            return False

    # Without a loaded nvidia module there is nothing "in use" to report, even if an
    # nvidia.ko happens to be installed on disk.
    if not Path("/sys/module/nvidia").is_dir():
        return None
    if not shutil.which("modinfo"):
        return None
    try:
        out = subprocess.run(
            ["modinfo", "nvidia"], capture_output=True, text=True, timeout=10
        ).stdout
    except (subprocess.SubprocessError, OSError):
        return None

    # Parse "key:   value" lines, keeping the first occurrence of each key.
    fields: dict[str, str] = {}
    for line in out.splitlines():
        key, sep, value = line.partition(":")
        if sep:
            fields.setdefault(key.strip(), value.strip())

    # Per NVIDIA's driver documentation the open module reports "Dual MIT/GPL" and the
    # proprietary module reports "NVIDIA". This is independent of the install path.
    module_license = fields.get("license", "").lower()
    if "gpl" in module_license:
        return True
    if module_license == "nvidia":
        return False

    # Last resort: packaging that installs to …/nvidia-###-open/nvidia.ko.
    filename = fields.get("filename")
    if filename is not None:
        return "-open" in filename
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def check_nvidia_module() -> list[Finding]:
    """Emit an informational finding when the open NVIDIA kernel module is loaded.

    Returns a single INFO finding in the "Driver" category when `_nvidia_module_is_open()`
    returns True, and an empty list for False or None. The finding is suggestion-only: it
    gives context for the no-Xid VA-space freeze signature.
    """
    open_module_loaded = _nvidia_module_is_open() is True
    if open_module_loaded:
        detail = (
            "The loaded NVIDIA driver is the open-source kernel module (nvidia-*-open). It's fine for "
            "most setups, but on some GeForce cards it hits driver-internal faults (VA-space mapping "
            "errors, hard freezes with no Xid) that the proprietary module doesn't."
        )
        suggestion = (
            "If you get unexplained hard freezes with no Xid in the logs, try the proprietary NVIDIA "
            "driver (`nvidia-driver-###` rather than the `…-open` variant) on the latest branch."
        )
        return [Finding(INFO, "Driver", "NVIDIA open kernel module in use", detail, suggestion)]
    return []
|
||||||
|
|
||||||
|
|
||||||
def _smart_devices() -> list[str]:
|
def _smart_devices() -> list[str]:
|
||||||
try:
|
try:
|
||||||
proc = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10)
|
proc = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10)
|
||||||
@@ -336,6 +408,7 @@ def run_health_checks(include_journal: bool = True) -> list[Finding]:
|
|||||||
|
|
||||||
findings: list[Finding] = []
|
findings: list[Finding] = []
|
||||||
findings += check_nvidia_driver()
|
findings += check_nvidia_driver()
|
||||||
|
findings += check_nvidia_module()
|
||||||
if include_journal:
|
if include_journal:
|
||||||
findings += check_journal()
|
findings += check_journal()
|
||||||
findings += check_journal_persistence()
|
findings += check_journal_persistence()
|
||||||
|
|||||||
@@ -11,11 +11,19 @@ from rigdoctor.core.health import (
|
|||||||
WARNING,
|
WARNING,
|
||||||
check_displays,
|
check_displays,
|
||||||
check_memory_speed,
|
check_memory_speed,
|
||||||
|
check_nvidia_module,
|
||||||
check_pcie_links,
|
check_pcie_links,
|
||||||
run_health_checks,
|
run_health_checks,
|
||||||
scan_journal_text,
|
scan_journal_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# A real no-Xid freeze: the open-module VA-space storm captured on 2026-05-29.
|
||||||
|
_VASPACE_LOG = """\
|
||||||
|
NVRM: nvCheckFailedNoLog: Check failed: 0 == (pMapNode->gpuMask & gpuMask) @ gpu_vaspace.c:4547
|
||||||
|
NVRM: dmaAllocMapping_GM107: can't update VA space for mapping @vaddr=0x4be00000
|
||||||
|
[drm:nv_drm_gem_alloc_nvkms_memory_ioctl [nvidia_drm]] *ERROR* Failed to allocate NVKMS memory for GEM object
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class HealthScanTests(unittest.TestCase):
|
class HealthScanTests(unittest.TestCase):
|
||||||
def test_xid_79_is_critical(self):
|
def test_xid_79_is_critical(self):
|
||||||
@@ -44,6 +52,28 @@ class HealthScanTests(unittest.TestCase):
|
|||||||
def test_clean_text_yields_no_findings(self):
|
def test_clean_text_yields_no_findings(self):
|
||||||
self.assertEqual(scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), [])
|
self.assertEqual(scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), [])
|
||||||
|
|
||||||
|
def test_vaspace_freeze_detected_without_any_xid(self):
    """The captured VA-space storm yields exactly one GPU WARNING that is not an Xid report."""
    gpu_findings = [
        finding for finding in scan_journal_text(_VASPACE_LOG) if finding.category == "GPU"
    ]
    self.assertEqual(len(gpu_findings), 1)
    vaspace = gpu_findings[0]
    self.assertEqual(vaspace.severity, WARNING)
    self.assertIn("VA-space", vaspace.title)
    # The log contains no Xid at all, so the title must not claim one.
    self.assertNotIn("Xid", vaspace.title)
    self.assertIn("open kernel module", vaspace.detail.lower())
|
||||||
|
|
||||||
|
def test_open_module_finding_when_open_loaded(self):
    """With the detector patched to report the open module, one INFO/Driver finding appears."""
    open_detected = mock.patch(
        "rigdoctor.core.health._nvidia_module_is_open", return_value=True
    )
    with open_detected:
        reported = check_nvidia_module()
    self.assertEqual(len(reported), 1)
    only = reported[0]
    self.assertEqual(only.severity, INFO)
    self.assertEqual(only.category, "Driver")
|
||||||
|
|
||||||
|
def test_no_module_finding_when_proprietary_or_absent(self):
    """No finding is produced when the detector reports proprietary (False) or unknown (None)."""
    for state in (False, None):
        # subTest labels a failure with the offending state and still runs the other one.
        with self.subTest(state=state), mock.patch(
            "rigdoctor.core.health._nvidia_module_is_open", return_value=state
        ):
            self.assertEqual(check_nvidia_module(), [])
|
||||||
|
|
||||||
def test_run_health_checks_returns_findings(self):
|
def test_run_health_checks_returns_findings(self):
|
||||||
# Runs against the real system; just assert it returns a sorted list of Findings.
|
# Runs against the real system; just assert it returns a sorted list of Findings.
|
||||||
findings = run_health_checks()
|
findings = run_health_checks()
|
||||||
|
|||||||
Reference in New Issue
Block a user