305c88ba09
A focused capture that ends without a clean stop (no session-stop, no live recorder) is treated as a likely hard freeze. - core/diagnostic.py: pending_crash() detects the unterminated session; acknowledge_crash() dismisses it; analyze_crash() combines the captured window (final readings + GPU-lost) with a focused scan of the PREVIOUS (crashed) boot + SMART/driver/persistence/temps. - health.check_previous_boot() scans `journalctl -k -b -1`; run_health_checks gained include_journal to avoid double-scanning for the crash path. - GUI: Games page shows a warning banner on launch for an interrupted diagnostic with Analyze crash / Dismiss → results dialog. - Tests for crash detection / clean-stop / acknowledge / in-progress. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
278 lines
11 KiB
Python
278 lines
11 KiB
Python
"""Health report (M4): scan kernel logs + SMART + driver/library state into a
|
||
prioritized, plain-language findings list with suggested fixes (read-only, D9).
|
||
|
||
Stdlib-only. Every check degrades gracefully — a missing tool/permission yields an
|
||
info finding, never an exception.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
|
||
# Severity levels, listed from most to least urgent.
CRITICAL = "critical"
WARNING = "warning"
INFO = "info"
OK = "ok"

# Sort rank for each severity: the lower the rank, the earlier it is reported.
_ORDER = {level: rank for rank, level in enumerate((CRITICAL, WARNING, INFO, OK))}
|
||
|
||
|
||
@dataclass
class Finding:
    """A single health-report entry: how bad it is, what area it concerns, and what to do."""

    # --- required, in positional order ---
    severity: str  # one of: critical | warning | info | ok
    category: str  # e.g. GPU, Kernel, Memory, Storage, Thermal, Driver, PCIe, Logs, Hardware
    title: str  # short headline

    # --- optional free text ---
    detail: str = ""  # longer explanation of what was observed
    suggestion: str = ""  # recommended next step

    # --- optional identifiers ---
    action: str = ""  # id of an installable catalog component (for an Install button)
    fix: str = ""  # id of an applyable runtime tunable (for an Apply dropdown, M6)
|
||
|
||
|
||
# --- NVIDIA Xid knowledge (the seed crash is Xid 79) --------------------------

# Xid code -> (severity, plain-language description). Codes that are not listed
# here are handled by the caller's own default.
_XID_INFO: dict[int, tuple[str, str]] = {
    13: (
        WARNING,
        "Graphics engine exception (often an app/driver bug or unstable overclock)",
    ),
    31: (
        WARNING,
        "GPU memory page fault (usually a driver or application bug)",
    ),
    43: (
        WARNING,
        "GPU stopped processing a task (application error)",
    ),
    45: (
        INFO,
        "Preemptive channel removal (often a side-effect of another error or a reboot)",
    ),
    48: (
        CRITICAL,
        "Double-bit ECC error — VRAM hardware fault",
    ),
    62: (
        CRITICAL,
        "Internal microcontroller halt (often follows instability)",
    ),
    79: (
        CRITICAL,
        "GPU has fallen off the bus — hardware: power delivery, PCIe link, or thermals",
    ),
    94: (
        CRITICAL,
        "Contained ECC error",
    ),
    95: (
        CRITICAL,
        "Uncontained ECC error",
    ),
    119: (
        CRITICAL,
        "GSP RPC timeout — GPU System Processor hang",
    ),
    120: (
        CRITICAL,
        "GSP error — GPU System Processor fault",
    ),
}

# Xid code -> tailored advice, for the codes that have any.
_XID_SUGGEST: dict[int, str] = {
    79: (
        "Check PSU/power cables and reseat the GPU/riser; test a lower power limit "
        "(`sudo nvidia-smi -pl <watts>`) and capture a session with `rigdoctor record`."
    ),
    48: "Persistent VRAM ECC errors mean failing memory — RMA the card if it recurs.",
    119: "GSP hangs are often driver-version specific — try a different driver branch.",
    120: "GSP errors are often driver-version specific — try a different driver branch.",
}

# Captures the numeric code after "Xid", allowing an optional parenthesised
# group and an optional colon in between.
_XID_RE = re.compile(r"Xid(?:\s*\([^)]*\))?:?\s*(\d+)")
|
||
|
||
|
||
def _count_oom_kills(lines: list[str], lowered: list[str]) -> int:
    """Estimate how many out-of-memory kills *lines* describe (0 when none).

    The kernel typically logs one kill as several lines: an "invoked oom-killer"
    trigger, an "oom-kill:" summary, an "... out of memory: Killed process"
    announcement, and an "oom_reaper" follow-up. Counting every such line would
    report a single kill three or four times, so each kind of line is tallied
    separately and the largest tally is returned.

    Args:
        lines: Log lines as read.
        lowered: The same lines lower-cased, index-aligned with *lines*.
    """
    announced = invoked = summarized = reaped = 0
    for ln, low in zip(lines, lowered):
        # The cgroup variant spells it "Memory cgroup out of memory: Killed ...".
        if "Out of memory" in ln or "out of memory: kill" in low:
            announced += 1
        if "oom-killer" in ln:
            invoked += 1
        elif "oom-kill" in ln:
            summarized += 1
        if "oom_reaper" in ln:
            reaped += 1
    return max(announced, invoked, summarized, reaped)


def scan_journal_text(text: str) -> list[Finding]:
    """Parse kernel-log text into findings (separated from IO so it's testable).

    Args:
        text: Kernel-log output, one message per line.

    Returns:
        Findings in a fixed order — one per distinct NVIDIA Xid code (ascending),
        then out-of-memory kills, kernel panic, machine check, PCIe AER, thermal
        and amdgpu reset — each present only when its signature was seen. An
        empty list means nothing matched.
    """
    lines = text.splitlines()
    lowered = [ln.lower() for ln in lines]  # index-aligned with `lines`
    findings: list[Finding] = []

    # Tally Xid codes so a repeated code yields a single finding with a count.
    xid_counts: dict[int, int] = {}
    for ln in lines:
        if "Xid" not in ln:  # cheap substring test before running the regex
            continue
        match = _XID_RE.search(ln)
        if match:
            code = int(match.group(1))
            xid_counts[code] = xid_counts.get(code, 0) + 1
    for code in sorted(xid_counts):
        severity, desc = _XID_INFO.get(code, (WARNING, f"NVIDIA GPU error (Xid {code})"))
        suggest = _XID_SUGGEST.get(code, "Look up this Xid code in NVIDIA's Xid error documentation.")
        findings.append(Finding(severity, "GPU", f"NVIDIA Xid {code} ×{xid_counts[code]}", desc, suggest))

    oom = _count_oom_kills(lines, lowered)
    if oom:
        findings.append(Finding(
            WARNING, "Memory", f"Out-of-memory kills ×{oom}",
            "The kernel killed processes to reclaim RAM.",
            "Close memory-heavy apps, add zram/swap, or investigate a leak.",
        ))

    if any("Kernel panic" in ln for ln in lines):
        findings.append(Finding(
            CRITICAL, "Kernel", "Kernel panic recorded",
            "The kernel hit an unrecoverable error.",
            "Note the panic message; review recent driver/kernel updates and hardware.",
        ))

    if any("mce:" in ln or "Machine check" in ln or "Hardware Error" in ln for ln in lines):
        findings.append(Finding(
            CRITICAL, "Hardware", "Machine Check Exception (MCE)",
            "The CPU reported a hardware error.",
            "Run memtest86 for RAM, check CPU temps/voltages, and review the MCE detail.",
        ))

    # "pcieport" is matched case-sensitively; only the word "error" is case-insensitive.
    if any(
        "AER:" in ln or "PCIe Bus Error" in ln or ("pcieport" in ln and "error" in low)
        for ln, low in zip(lines, lowered)
    ):
        findings.append(Finding(
            WARNING, "PCIe", "PCIe bus errors (AER)",
            "Correctable/uncorrectable PCIe errors were logged.",
            "Reseat the device and check risers/cabling; AER storms can precede a GPU drop.",
        ))

    if any(
        ("thermal" in low and ("critical" in low or "throttl" in low))
        or "temperature above threshold" in low
        for low in lowered
    ):
        findings.append(Finding(
            WARNING, "Thermal", "Thermal events logged",
            "The system logged thermal throttling / critical-temperature events.",
            "Improve airflow/cooling and check fan curves; watch live temps on the dashboard.",
        ))

    if any("amdgpu" in low and "reset" in low for low in lowered):
        findings.append(Finding(
            CRITICAL, "GPU", "AMD GPU reset (amdgpu)",
            "The AMD GPU was reset after a hang.",
            "Check power/thermals/driver; capture a session with `rigdoctor record`.",
        ))

    return findings
|
||
|
||
|
||
def _journalctl(args: list[str]) -> str | None:
    """Run ``journalctl`` with *args* and return its stdout, or None if unusable.

    Args:
        args: Command-line arguments placed after ``journalctl``.

    Returns:
        The captured stdout, or None when journalctl is not on PATH, cannot be
        started, exceeds the 25-second timeout, or exits non-zero without
        printing anything. Never raises for those conditions.
    """
    if shutil.which("journalctl") is None:
        return None
    try:
        proc = subprocess.run(
            ["journalctl", *args],
            capture_output=True,
            text=True,
            # Log lines can carry bytes that don't decode in the current locale;
            # substitute them rather than let a UnicodeDecodeError escape.
            errors="replace",
            timeout=25,
        )
    except (subprocess.SubprocessError, OSError):
        return None
    # A failed run with no output is "couldn't read", not "read and found nothing".
    # Output from a failed run is still returned so partial logs aren't thrown away.
    if proc.returncode != 0 and not proc.stdout.strip():
        return None
    return proc.stdout
|
||
|
||
|
||
def check_journal() -> list[Finding]:
    """Scan the last 7 days of kernel messages from the journal.

    Returns:
        A single info finding when the journal could not be read; otherwise the
        findings from ``scan_journal_text``, or a single OK finding when that
        scan turns up nothing.
    """
    raw = _journalctl(["-k", "--no-pager", "-o", "cat", "--since", "-7 days"])
    if raw is None:
        unreadable = Finding(
            INFO,
            "Logs",
            "Couldn't read the kernel journal",
            "journalctl is unavailable or not readable.",
            "Ensure systemd/journald is present and your user is in the 'systemd-journal' or 'adm' group.",
        )
        return [unreadable]

    results = scan_journal_text(raw)
    if results:
        return results

    all_clear = Finding(
        OK,
        "Logs",
        "No notable kernel errors (last 7 days)",
        "No Xid, panic, OOM, MCE, PCIe AER, or thermal events found.",
    )
    return [all_clear]
|
||
|
||
|
||
def check_previous_boot() -> list[Finding]:
    """Scan the previous boot's kernel log — the boot that crashed — for fault signatures.

    Needs persistent journald (else the crashed boot's logs were lost on reboot, which the
    persistence check flags separately). Findings are framed as coming from that boot.

    Returns:
        The findings from ``scan_journal_text`` for boot ``-1``, each with its
        detail prefixed to say it came from the previous boot. Empty when the
        journal is unavailable or that boot has no kernel output.
    """
    # Local import: only this function needs it.
    from dataclasses import replace

    out = _journalctl(["-k", "-b", "-1", "--no-pager", "-o", "cat"])
    if not out or not out.strip():
        return []

    prefix = "Logged during the previous (crashed) boot. "
    # replace() copies every field, so `action` and `fix` survive the re-tagging
    # instead of being reset to their defaults.
    return [
        replace(f, detail=(prefix + (f.detail or "")).strip())
        for f in scan_journal_text(out)
    ]
|
||
|
||
|
||
def check_journal_persistence() -> list[Finding]:
    """Warn when ``/var/log/journal`` is not a directory.

    Returns:
        An empty list when the directory exists, otherwise a single warning
        finding with the command to enable persistent logging.
    """
    has_persistent_dir = Path("/var/log/journal").is_dir()
    if not has_persistent_dir:
        return [
            Finding(
                WARNING,
                "Logs",
                "journald isn't persistent across reboots",
                "Crash-boot kernel logs are discarded on reboot, so a hard freeze's evidence can vanish.",
                "Enable persistent logging: `sudo mkdir -p /var/log/journal && sudo systemctl restart systemd-journald`",
            )
        ]
    return []
|
||
|
||
|
||
def check_nvidia_driver() -> list[Finding]:
    """Detect an NVIDIA driver/library version mismatch via ``nvidia-smi``.

    Returns:
        A single critical finding when the output of ``nvidia-smi`` contains
        the mismatch message; an empty list otherwise, including when the tool
        is missing, cannot be run, or times out after 10 seconds.
    """
    if shutil.which("nvidia-smi") is None:
        return []

    try:
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, timeout=10)
    except (subprocess.SubprocessError, OSError):
        return []

    # The message may be on either stream, so search both.
    output = result.stdout + result.stderr
    if "Driver/library version mismatch" not in output:
        return []

    mismatch = Finding(
        CRITICAL,
        "Driver",
        "NVIDIA driver/library version mismatch",
        "The loaded kernel module and the userspace NVIDIA libraries differ — GPU monitoring will fail until resolved.",
        "Reboot to load the matching module (or finish the interrupted driver update).",
    )
    return [mismatch]
|
||
|
||
|
||
def _smart_devices() -> list[str]:
    """List the device paths printed by ``smartctl --scan``.

    Returns:
        The first whitespace-separated token of every output line that starts
        with ``/dev/`` (after stripping), in output order. Empty when smartctl
        cannot be run or times out after 10 seconds.
    """
    try:
        scan = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10)
    except (subprocess.SubprocessError, OSError):
        return []

    entries = (raw.strip() for raw in scan.stdout.splitlines())
    return [entry.split()[0] for entry in entries if entry.startswith("/dev/")]
|
||
|
||
|
||
def check_smart() -> list[Finding]:
    """Report SMART overall health for each drive ``smartctl`` can enumerate.

    Returns:
        A single info finding when smartctl is missing or no drives are listed.
        Otherwise at most one finding per drive: info when root is needed, OK
        when the output says PASSED, critical when it says FAILED or
        FAILING_NOW. Drives whose query fails or whose output matches none of
        these contribute nothing.
    """
    if shutil.which("smartctl") is None:
        missing_tool = Finding(
            INFO,
            "Storage",
            "SMART not checked (smartmontools missing)",
            "Disk self-health couldn't be read.",
            "Install it for disk health checks: `sudo apt install smartmontools`",
        )
        return [missing_tool]

    drives = _smart_devices()
    if not drives:
        no_drives = Finding(
            INFO,
            "Storage",
            "SMART: couldn't enumerate drives",
            "Reading SMART usually needs root.",
            "Run: `sudo rigdoctor report`",
        )
        return [no_drives]

    report: list[Finding] = []
    for dev in drives:
        try:
            health = subprocess.run(["smartctl", "-H", dev], capture_output=True, text=True, timeout=15)
        except (subprocess.SubprocessError, OSError):
            continue  # best effort: skip a drive that can't be queried

        text = health.stdout + health.stderr
        needs_root = "Permission denied" in text or "requires root" in text.lower()
        # Checked in this order: permission problem, then PASSED, then failure markers.
        if needs_root:
            entry = Finding(INFO, "Storage", f"SMART for {dev} needs root", "", "Run: `sudo rigdoctor report`")
        elif "PASSED" in text:
            entry = Finding(OK, "Storage", f"SMART OK: {dev}", "Overall-health self-assessment passed.")
        elif "FAILED" in text or "FAILING_NOW" in text:
            entry = Finding(
                CRITICAL,
                "Storage",
                f"SMART FAILED: {dev}",
                "The drive reports failing health.",
                "Back up now and replace the drive.",
            )
        else:
            continue  # unrecognised output: say nothing about this drive
        report.append(entry)
    return report
|
||
|
||
|
||
def check_live_temps() -> list[Finding]:
    """Take one live sample and warn about any temperature at or above 90 °C.

    Returns:
        An empty list when no "°C" reading reaches 90; otherwise one warning
        finding whose title shows the highest value and whose detail lists
        every hot reading.
    """
    from .sampler import Sampler
    from .sources import available_sources

    snapshot = Sampler(available_sources()).sample()

    hot = []
    for reading in snapshot.readings:
        if reading.unit == "°C" and reading.value is not None and reading.value >= 90:
            # Fall back to the metric name when the reading has no label.
            name = reading.label or reading.metric
            hot.append((reading.source, name, reading.value))
    if not hot:
        return []

    peak = max(value for _source, _name, value in hot)
    listing = "; ".join(f"{source} {name} {value:.0f}°C" for source, name, value in hot)
    return [
        Finding(
            WARNING,
            "Thermal",
            f"High temperature right now ({peak:.0f}°C)",
            listing,
            "Check cooling/airflow and reduce load.",
        )
    ]
|
||
|
||
|
||
def run_health_checks(include_journal: bool = True) -> list[Finding]:
    """Run every check and return the findings ordered worst-first.

    SMART needs root; when ``elevation.privileged()`` supplies a "smart" entry,
    those results are used instead of re-running smartctl (which would just
    report "needs root").

    Args:
        include_journal: When False, skip the 7-day kernel-journal scan — used
            by the crash analysis, which scans the previous (crashed) boot
            specifically instead.

    Returns:
        All findings sorted by severity rank; a severity with no known rank
        sorts last. Findings of equal rank keep the order they were collected.
    """
    from . import elevation

    collected: list[Finding] = list(check_nvidia_driver())
    if include_journal:
        collected.extend(check_journal())
    collected.extend(check_journal_persistence())

    priv = elevation.privileged()
    smart_rows = priv.get("smart") if priv is not None else None
    if smart_rows is None:
        collected.extend(check_smart())
    else:
        # Each row is expanded as keyword arguments to Finding.
        collected.extend(Finding(**row) for row in smart_rows)

    collected.extend(check_live_temps())
    return sorted(collected, key=lambda f: _ORDER.get(f.severity, 9))
|