"""Health report (M4): scan kernel logs + SMART + driver/library state into a prioritized, plain-language findings list with suggested fixes (read-only, D9). Stdlib-only. Every check degrades gracefully — a missing tool/permission yields an info finding, never an exception. """ from __future__ import annotations import re import shutil import subprocess from dataclasses import dataclass from pathlib import Path CRITICAL = "critical" WARNING = "warning" INFO = "info" OK = "ok" _ORDER = {CRITICAL: 0, WARNING: 1, INFO: 2, OK: 3} @dataclass class Finding: severity: str # critical | warning | info | ok category: str # GPU, Kernel, Memory, Storage, Thermal, Driver, PCIe, Logs title: str detail: str = "" suggestion: str = "" action: str = "" # optional: id of an installable catalog component (for an Install button) fix: str = "" # optional: id of an applyable runtime tunable (for an Apply dropdown, M6) # --- NVIDIA Xid knowledge (the seed crash is Xid 79) -------------------------- _XID_INFO: dict[int, tuple[str, str]] = { 13: (WARNING, "Graphics engine exception (often an app/driver bug or unstable overclock)"), 31: (WARNING, "GPU memory page fault (usually a driver or application bug)"), 43: (WARNING, "GPU stopped processing a task (application error)"), 45: (INFO, "Preemptive channel removal (often a side-effect of another error or a reboot)"), 48: (CRITICAL, "Double-bit ECC error — VRAM hardware fault"), 62: (CRITICAL, "Internal microcontroller halt (often follows instability)"), 79: (CRITICAL, "GPU has fallen off the bus — hardware: power delivery, PCIe link, or thermals"), 94: (CRITICAL, "Contained ECC error"), 95: (CRITICAL, "Uncontained ECC error"), 119: (CRITICAL, "GSP RPC timeout — GPU System Processor hang"), 120: (CRITICAL, "GSP error — GPU System Processor fault"), } _XID_SUGGEST: dict[int, str] = { 79: "Check PSU/power cables and reseat the GPU/riser; test a lower power limit " "(`sudo nvidia-smi -pl `) and capture a session with `rigdoctor record`.", 48: "Persistent VRAM ECC errors mean failing memory — RMA the card if it recurs.", 119: "GSP hangs are often driver-version specific — try a different driver branch.", 120: "GSP errors are often driver-version specific — try a different driver branch.", } _XID_RE = re.compile(r"Xid(?:\s*\([^)]*\))?:?\s*(\d+)") def scan_journal_text(text: str) -> list[Finding]: """Parse kernel-log text into findings (separated from IO so it's testable).""" lines = text.splitlines() findings: list[Finding] = [] xids: dict[int, int] = {} for line in lines: if "Xid" in line: m = _XID_RE.search(line) if m: code = int(m.group(1)) xids[code] = xids.get(code, 0) + 1 for code in sorted(xids): severity, desc = _XID_INFO.get(code, (WARNING, f"NVIDIA GPU error (Xid {code})")) suggest = _XID_SUGGEST.get(code, "Look up this Xid code in NVIDIA's Xid error documentation.") findings.append(Finding(severity, "GPU", f"NVIDIA Xid {code} ×{xids[code]}", desc, suggest)) oom = sum(1 for ln in lines if "Out of memory" in ln or "oom-kill" in ln or "oom_reaper" in ln) if oom: findings.append(Finding( WARNING, "Memory", f"Out-of-memory kills ×{oom}", "The kernel killed processes to reclaim RAM.", "Close memory-heavy apps, add zram/swap, or investigate a leak.", )) if any("Kernel panic" in ln for ln in lines): findings.append(Finding( CRITICAL, "Kernel", "Kernel panic recorded", "The kernel hit an unrecoverable error.", "Note the panic message; review recent driver/kernel updates and hardware.", )) if any("mce:" in ln or "Machine check" in ln or "Hardware Error" in ln for ln in lines): findings.append(Finding( CRITICAL, "Hardware", "Machine Check Exception (MCE)", "The CPU reported a hardware error.", "Run memtest86 for RAM, check CPU temps/voltages, and review the MCE detail.", )) if any("AER:" in ln or "PCIe Bus Error" in ln or ("pcieport" in ln and "error" in ln.lower()) for ln in lines): findings.append(Finding( WARNING, "PCIe", "PCIe bus errors (AER)", "Correctable/uncorrectable PCIe errors were logged.", "Reseat the device and check risers/cabling; AER storms can precede a GPU drop.", )) low = [ln.lower() for ln in lines] if any(("thermal" in ln and ("critical" in ln or "throttl" in ln)) or "temperature above threshold" in ln for ln in low): findings.append(Finding( WARNING, "Thermal", "Thermal events logged", "The system logged thermal throttling / critical-temperature events.", "Improve airflow/cooling and check fan curves; watch live temps on the dashboard.", )) if any("amdgpu" in ln and "reset" in ln for ln in low): findings.append(Finding( CRITICAL, "GPU", "AMD GPU reset (amdgpu)", "The AMD GPU was reset after a hang.", "Check power/thermals/driver; capture a session with `rigdoctor record`.", )) return findings def _journalctl(args: list[str]) -> str | None: if shutil.which("journalctl") is None: return None try: proc = subprocess.run(["journalctl", *args], capture_output=True, text=True, timeout=25) return proc.stdout except (subprocess.SubprocessError, OSError): return None def check_journal() -> list[Finding]: out = _journalctl(["-k", "--no-pager", "-o", "cat", "--since", "-7 days"]) if out is None: return [Finding( INFO, "Logs", "Couldn't read the kernel journal", "journalctl is unavailable or not readable.", "Ensure systemd/journald is present and your user is in the 'systemd-journal' or 'adm' group.", )] findings = scan_journal_text(out) if not findings: findings.append(Finding( OK, "Logs", "No notable kernel errors (last 7 days)", "No Xid, panic, OOM, MCE, PCIe AER, or thermal events found.", )) return findings def check_previous_boot() -> list[Finding]: """Scan the previous boot's kernel log — the boot that crashed — for fault signatures. Needs persistent journald (else the crashed boot's logs were lost on reboot, which the persistence check flags separately). Findings are framed as coming from that boot. """ out = _journalctl(["-k", "-b", "-1", "--no-pager", "-o", "cat"]) if not out or not out.strip(): return [] tagged = [] for f in scan_journal_text(out): detail = ("Logged during the previous (crashed) boot. " + (f.detail or "")).strip() tagged.append(Finding(f.severity, f.category, f.title, detail, f.suggestion)) return tagged def check_journal_persistence() -> list[Finding]: if Path("/var/log/journal").is_dir(): return [] return [Finding( WARNING, "Logs", "journald isn't persistent across reboots", "Crash-boot kernel logs are discarded on reboot, so a hard freeze's evidence can vanish.", "Enable persistent logging: `sudo mkdir -p /var/log/journal && sudo systemctl restart systemd-journald`", )] def check_nvidia_driver() -> list[Finding]: if shutil.which("nvidia-smi") is None: return [] try: proc = subprocess.run(["nvidia-smi"], capture_output=True, text=True, timeout=10) except (subprocess.SubprocessError, OSError): return [] if "Driver/library version mismatch" in (proc.stdout + proc.stderr): return [Finding( CRITICAL, "Driver", "NVIDIA driver/library version mismatch", "The loaded kernel module and the userspace NVIDIA libraries differ — GPU monitoring will fail until resolved.", "Reboot to load the matching module (or finish the interrupted driver update).", )] return [] def _smart_devices() -> list[str]: try: proc = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10) except (subprocess.SubprocessError, OSError): return [] devices = [] for line in proc.stdout.splitlines(): line = line.strip() if line.startswith("/dev/"): devices.append(line.split()[0]) return devices def check_smart() -> list[Finding]: if shutil.which("smartctl") is None: return [Finding( INFO, "Storage", "SMART not checked (smartmontools missing)", "Disk self-health couldn't be read.", "Install it for disk health checks: `sudo apt install smartmontools`", )] devices = _smart_devices() if not devices: return [Finding( INFO, "Storage", "SMART: couldn't enumerate drives", "Reading SMART usually needs root.", "Run: `sudo rigdoctor report`", )] findings: list[Finding] = [] for dev in devices: try: proc = subprocess.run(["smartctl", "-H", dev], capture_output=True, text=True, timeout=15) except (subprocess.SubprocessError, OSError): continue combined = proc.stdout + proc.stderr if "Permission denied" in combined or "requires root" in combined.lower(): findings.append(Finding(INFO, "Storage", f"SMART for {dev} needs root", "", "Run: `sudo rigdoctor report`")) elif "PASSED" in combined: findings.append(Finding(OK, "Storage", f"SMART OK: {dev}", "Overall-health self-assessment passed.")) elif "FAILED" in combined or "FAILING_NOW" in combined: findings.append(Finding(CRITICAL, "Storage", f"SMART FAILED: {dev}", "The drive reports failing health.", "Back up now and replace the drive.")) return findings def check_live_temps() -> list[Finding]: from .sampler import Sampler from .sources import available_sources sample = Sampler(available_sources()).sample() hot = [ (r.source, r.label or r.metric, r.value) for r in sample.readings if r.unit == "°C" and r.value is not None and r.value >= 90 ] if not hot: return [] worst = max(hot, key=lambda x: x[2]) detail = "; ".join(f"{s} {label} {v:.0f}°C" for s, label, v in hot) return [Finding( WARNING, "Thermal", f"High temperature right now ({worst[2]:.0f}°C)", detail, "Check cooling/airflow and reduce load.", )] def run_health_checks(include_journal: bool = True) -> list[Finding]: """Run all checks and return findings sorted by severity (worst first). SMART needs root; if the session collected it via launch elevation, use that instead of re-running smartctl (which would just report "needs root"). `include_journal=False` skips the 7-day kernel-journal scan — used by the crash analysis, which scans the previous (crashed) boot specifically instead. """ from . import elevation findings: list[Finding] = [] findings += check_nvidia_driver() if include_journal: findings += check_journal() findings += check_journal_persistence() priv = elevation.privileged() if priv is not None and priv.get("smart") is not None: findings += [Finding(**d) for d in priv["smart"]] else: findings += check_smart() findings += check_live_temps() findings.sort(key=lambda f: _ORDER.get(f.severity, 9)) return findings