From b1bc961b792dadbad5a3500f960415982d2c0d6f Mon Sep 17 00:00:00 2001 From: Jessey van Offeren Date: Fri, 29 May 2026 16:07:14 +0200 Subject: [PATCH] feat(health): detect no-Xid GPU freezes (open-module VA-space faults) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel-log scanner only caught Xid codes, OOM, panic, MCE, AER, thermal, and amdgpu resets — so a hard freeze that logs NO Xid slipped through entirely. Add detection for the NVIDIA open-kernel-module VA-space mapping fault (gpu_vaspace.c / dmaAllocMapping / NVKMS GEM-allocation failures), which can storm for minutes and end in a freeze without the GPU ever "falling off the bus". Also flag when the open kernel module (nvidia-*-open) is loaded — the context behind these faults — and add an AI-knowledge entry so the assistant distinguishes it from the Xid 79 hardware drop. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/rigdoctor/core/ai_knowledge.py | 8 ++++ src/rigdoctor/core/health.py | 73 ++++++++++++++++++++++++++++++ tests/test_health.py | 30 ++++++++++++ 3 files changed, 111 insertions(+) diff --git a/src/rigdoctor/core/ai_knowledge.py b/src/rigdoctor/core/ai_knowledge.py index 336fb1d..799c145 100644 --- a/src/rigdoctor/core/ai_knowledge.py +++ b/src/rigdoctor/core/ai_knowledge.py @@ -30,6 +30,14 @@ ENTRIES: list[tuple[tuple[str, ...], str]] = [ (("xid 8", "xid 62", "xid 63", "xid 64"), "These Xid codes commonly indicate VRAM/ECC or memory-training problems — suspect failing " "VRAM or an unstable memory overclock."), + (("va-space mapping", "gpu_vaspace", "dmaallocmapping", "nvkms memory for gem", + "open kernel module", "nvidia open"), + "NVIDIA open-kernel-module VA-space mapping errors (gpu_vaspace.c / dmaAllocMapping / " + "'Failed to allocate NVKMS memory for GEM object') are a driver-internal fault on the open " + "module (nvidia-*-open). They can storm for minutes and end in a HARD FREEZE with NO Xid " + "logged — so the GPU never 'falls off the bus', and this is distinct from the Xid 79 " + "hardware drop. Fix path: switch from the open to the proprietary NVIDIA kernel module and " + "update to the latest driver branch."), (("smart 197", "current_pending_sector", "pending sector"), "SMART 197 (Current Pending Sector) > 0 = sectors the drive can't read and is waiting to " "reallocate — early sign of a failing disk. Back up now and run an extended self-test."), diff --git a/src/rigdoctor/core/health.py b/src/rigdoctor/core/health.py index 9be430c..efadf2d 100644 --- a/src/rigdoctor/core/health.py +++ b/src/rigdoctor/core/health.py @@ -116,6 +116,31 @@ def scan_journal_text(text: str) -> list[Finding]: "Check power/thermals/driver; capture a session with `rigdoctor record`.", )) + # NVIDIA open-kernel-module VA-space mapping faults: a driver-internal failure that can + # storm for minutes and end in a HARD FREEZE with NO Xid logged — the GPU never "falls off + # the bus", so the Xid scan above misses it entirely. These code paths live in the open + # kernel module (nvidia-*-open); the proprietary module doesn't hit them. + nvrm_va = [ + ln for ln in lines + if "gpu_vaspace.c" in ln + or "_gvaspaceMappingInsert" in ln + or "dmaAllocMapping" in ln + or "NVKMS memory for GEM object" in ln + ] + if nvrm_va: + findings.append(Finding( + WARNING, "GPU", f"NVIDIA driver VA-space mapping errors ×{len(nvrm_va)}", + "The NVIDIA kernel module repeatedly failed to update the GPU's virtual address " + "space (gpu_vaspace / dmaAllocMapping assertions, NVKMS GEM-allocation failures). " + "This is a driver-internal fault that can recur for minutes and end in a hard freeze " + "with NO Xid logged — distinct from an Xid 79 hardware drop. These code paths are " + "specific to the open kernel module (nvidia-*-open).", + "If you're on the open module, switch to the proprietary NVIDIA driver " + "(install `nvidia-driver-###` instead of the `…-open` variant) and update to the " + "latest branch, then reboot. Capture a session with `rigdoctor record` to confirm " + "the errors precede the freeze.", + )) + return findings @@ -188,6 +213,53 @@ def check_nvidia_driver() -> list[Finding]: return [] +def _read_text(path: str) -> str | None: + try: + return Path(path).read_text() + except OSError: + return None + + +def _nvidia_module_is_open() -> bool | None: + """Whether the *loaded* NVIDIA kernel module is the open-source flavor. + + True = open (nvidia-*-open), False = proprietary, None = can't tell / no NVIDIA module. + /proc is authoritative for the loaded module and needs no external tool; modinfo's filename + (…/nvidia-###-open/nvidia.ko) is the fallback. + """ + proc = _read_text("/proc/driver/nvidia/version") + if proc: + low = proc.lower() + if "open kernel module" in low: + return True + if "kernel module" in low: # proprietary banner: "NVIDIA UNIX … Kernel Module …" + return False + if shutil.which("modinfo"): + try: + out = subprocess.run(["modinfo", "nvidia"], capture_output=True, text=True, timeout=10).stdout + except (subprocess.SubprocessError, OSError): + out = "" + for line in out.splitlines(): + if line.startswith("filename:"): + return "-open" in line + return None + + +def check_nvidia_module() -> list[Finding]: + """Note when the open-source NVIDIA kernel module is loaded — the context behind the no-Xid + VA-space freeze signature, which lives in the open module's code paths (suggestion-only).""" + if _nvidia_module_is_open() is not True: + return [] + return [Finding( + INFO, "Driver", "NVIDIA open kernel module in use", + "The loaded NVIDIA driver is the open-source kernel module (nvidia-*-open). It's fine for " + "most setups, but on some GeForce cards it hits driver-internal faults (VA-space mapping " + "errors, hard freezes with no Xid) that the proprietary module doesn't.", + "If you get unexplained hard freezes with no Xid in the logs, try the proprietary NVIDIA " + "driver (`nvidia-driver-###` rather than the `…-open` variant) on the latest branch.", + )] + + def _smart_devices() -> list[str]: try: proc = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10) @@ -336,6 +408,7 @@ def run_health_checks(include_journal: bool = True) -> list[Finding]: findings: list[Finding] = [] findings += check_nvidia_driver() + findings += check_nvidia_module() if include_journal: findings += check_journal() findings += check_journal_persistence() diff --git a/tests/test_health.py b/tests/test_health.py index 4d6078d..a5d19e2 100644 --- a/tests/test_health.py +++ b/tests/test_health.py @@ -11,11 +11,19 @@ from rigdoctor.core.health import ( WARNING, check_displays, check_memory_speed, + check_nvidia_module, check_pcie_links, run_health_checks, scan_journal_text, ) +# A real no-Xid freeze: the open-module VA-space storm captured on 2026-05-29. +_VASPACE_LOG = """\ +NVRM: nvCheckFailedNoLog: Check failed: 0 == (pMapNode->gpuMask & gpuMask) @ gpu_vaspace.c:4547 +NVRM: dmaAllocMapping_GM107: can't update VA space for mapping @vaddr=0x4be00000 +[drm:nv_drm_gem_alloc_nvkms_memory_ioctl [nvidia_drm]] *ERROR* Failed to allocate NVKMS memory for GEM object +""" + class HealthScanTests(unittest.TestCase): def test_xid_79_is_critical(self): @@ -44,6 +52,28 @@ class HealthScanTests(unittest.TestCase): def test_clean_text_yields_no_findings(self): self.assertEqual(scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), []) + def test_vaspace_freeze_detected_without_any_xid(self): + findings = scan_journal_text(_VASPACE_LOG) + gpu = [f for f in findings if f.category == "GPU"] + self.assertEqual(len(gpu), 1) + self.assertEqual(gpu[0].severity, WARNING) + self.assertIn("VA-space", gpu[0].title) + # It must NOT be misreported as an Xid finding (the log has no Xid at all). + self.assertNotIn("Xid", gpu[0].title) + self.assertIn("open kernel module", gpu[0].detail.lower()) + + def test_open_module_finding_when_open_loaded(self): + with mock.patch("rigdoctor.core.health._nvidia_module_is_open", return_value=True): + findings = check_nvidia_module() + self.assertEqual(len(findings), 1) + self.assertEqual(findings[0].severity, INFO) + self.assertEqual(findings[0].category, "Driver") + + def test_no_module_finding_when_proprietary_or_absent(self): + for state in (False, None): + with mock.patch("rigdoctor.core.health._nvidia_module_is_open", return_value=state): + self.assertEqual(check_nvidia_module(), []) + def test_run_health_checks_returns_findings(self): # Runs against the real system; just assert it returns a sorted list of Findings. findings = run_health_checks()