feat: detect a hard-crashed diagnostic + analyze the crash boot — 0.15.0
A focused capture that ends without a clean stop (no session-stop, no live recorder) is treated as a likely hard freeze. - core/diagnostic.py: pending_crash() detects the unterminated session; acknowledge_crash() dismisses it; analyze_crash() combines the captured window (final readings + GPU-lost) with a focused scan of the PREVIOUS (crashed) boot + SMART/driver/persistence/temps. - health.check_previous_boot() scans `journalctl -k -b -1`; run_health_checks gained include_journal to avoid double-scanning for the crash path. - GUI: Games page shows a warning banner on launch for an interrupted diagnostic with Analyze crash / Dismiss → results dialog. - Tests for crash detection / clean-stop / acknowledge / in-progress. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,19 @@ All notable changes to RigDoctor are recorded here. Format follows
|
|||||||
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
|
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
|
||||||
release tag (so the auto-updater, D18, can compare versions).
|
release tag (so the auto-updater, D18, can compare versions).
|
||||||
|
|
||||||
|
## [0.15.0] - 2026-05-22
|
||||||
|
### Added
|
||||||
|
- **Hard-crash detection & recovery for the guided diagnostic.** If a focused capture ends
|
||||||
|
without a clean stop (the recorder never wrote `session-stop` and isn't running), RigDoctor
|
||||||
|
treats it as a likely hard freeze. On launch the **Games** page shows a warning banner —
|
||||||
|
*"Your last diagnostic for `<game>` ended unexpectedly…"* — with **Analyze crash** / **Dismiss**.
|
||||||
|
- **Deeper crash analysis.** *Analyze crash* combines the captured window (final readings before
|
||||||
|
the freeze + any GPU-lost event) with a focused scan of the **previous (crashed) boot's kernel
|
||||||
|
log** (`journalctl -k -b -1`: Xid/panic/OOM/MCE/AER/thermal) plus SMART/driver/persistence/
|
||||||
|
live-temp checks — the full "what happened" picture. `core/diagnostic.py` gains
|
||||||
|
`pending_crash()` / `analyze_crash()`; `health.check_previous_boot()` +
|
||||||
|
`run_health_checks(include_journal=False)` back it.
|
||||||
|
|
||||||
## [0.14.0] - 2026-05-22
|
## [0.14.0] - 2026-05-22
|
||||||
### Changed
|
### Changed
|
||||||
- **Dashboard headline tiles are now history trend graphs** instead of single-value gauges —
|
- **Dashboard headline tiles are now history trend graphs** instead of single-value gauges —
|
||||||
|
|||||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "rigdoctor"
|
name = "rigdoctor"
|
||||||
version = "0.14.0"
|
version = "0.15.0"
|
||||||
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
|
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
|
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
|
||||||
|
|
||||||
__version__ = "0.14.0"
|
__version__ = "0.15.0"
|
||||||
|
|||||||
@@ -11,13 +11,16 @@ The capture is **manually bracketed** (start/finish) for now; auto start/stop on
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from .. import config
|
from .. import config
|
||||||
from . import reccontrol
|
from . import reccontrol
|
||||||
from .crashlog import Summary, summarize
|
from .crashlog import Summary, summarize
|
||||||
from .health import Finding
|
from .health import CRITICAL, OK, WARNING, Finding
|
||||||
|
|
||||||
|
_SEV_ORDER = {CRITICAL: 0, WARNING: 1, "info": 2, OK: 3}
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -27,6 +30,14 @@ class DiagnosticResult:
|
|||||||
findings: list[Finding] # health findings: Xid/SMART/driver/etc. (M4)
|
findings: list[Finding] # health findings: Xid/SMART/driver/etc. (M4)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CrashInfo:
    """What is known about a focused capture that ended without a clean stop.

    Produced by `pending_crash()` from the diagnostic log's summary.
    """

    # Game name recovered from the capture's summary; may be None.
    game: str | None
    # Sample count reported by the log summary.
    samples: int
    # Timestamp of the last captured sample (≈ when the freeze hit); may be None.
    when: float | None
    # True when a `gpu-lost` event is among the recorded events.
    gpu_lost: bool
|
||||||
|
|
||||||
|
|
||||||
def _clear_diag_log() -> None:
|
def _clear_diag_log() -> None:
|
||||||
"""Each diagnostic is a fresh focused capture — drop any previous session + segments."""
|
"""Each diagnostic is a fresh focused capture — drop any previous session + segments."""
|
||||||
base = config.DIAG_LOG
|
base = config.DIAG_LOG
|
||||||
@@ -82,3 +93,70 @@ def finish(last_n: int = 10, log_path=None) -> DiagnosticResult:
|
|||||||
game = _game_from_summary(summary) or (reccontrol.read_status() or {}).get("game")
|
game = _game_from_summary(summary) or (reccontrol.read_status() or {}).get("game")
|
||||||
findings = run_health_checks()
|
findings = run_health_checks()
|
||||||
return DiagnosticResult(game=game, summary=summary, findings=findings)
|
return DiagnosticResult(game=game, summary=summary, findings=findings)
|
||||||
|
|
||||||
|
|
||||||
|
# --- hard-crash detection & post-crash analysis -----------------------------------
|
||||||
|
|
||||||
|
def pending_crash() -> CrashInfo | None:
    """Return details of a diagnostic capture that ended abnormally, or None.

    A focused capture logs `session-start` (+ `game`) when it begins and `session-stop`
    when it is stopped cleanly. A hard freeze never reaches the stop, so a log that has a
    start, no stop and no live recorder is the hard-crash signal.

    Returns None when a capture is still running, no diagnostic log exists, the log has no
    `session-start`, the session stopped cleanly, or the crash was already acknowledged
    (a `diagnostic-acknowledged` event is present).
    """
    if is_running():
        return None
    log = config.DIAG_LOG
    if not log.exists():
        return None

    summary = summarize(log)
    seen = set()
    for _ts, kind, _detail in summary.events:
        seen.add(kind)

    started = "session-start" in seen
    # Either a clean stop or a user acknowledgement settles the session.
    resolved = not seen.isdisjoint(("session-stop", "diagnostic-acknowledged"))
    if not started or resolved:
        return None

    return CrashInfo(
        game=_game_from_summary(summary),
        samples=summary.samples,
        when=summary.end,
        gpu_lost="gpu-lost" in seen,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def acknowledge_crash() -> None:
    """Mark the recorded crash as seen so it stops prompting.

    Appends a `diagnostic-acknowledged` event to the diagnostic log as its own JSON line.
    A hard freeze can cut the recorder off mid-write and leave the last line without a
    trailing newline; in that case a newline is written first so the marker is not glued
    onto the truncated line (where a line-by-line reader could not parse it and the crash
    would keep being reported as pending).

    Best effort: an OSError while creating the directory or writing is ignored, in which
    case no marker is recorded.
    """
    marker = json.dumps({"ts": time.time(), "event": "diagnostic-acknowledged", "detail": ""})
    try:
        config.DIAG_LOG.parent.mkdir(parents=True, exist_ok=True)
        # Binary append+read: lets us inspect the final byte; appended writes land at EOF.
        with open(config.DIAG_LOG, "a+b") as fh:
            fh.seek(0, 2)  # whence=2 is SEEK_END
            if fh.tell() > 0:
                fh.seek(-1, 2)
                if fh.read(1) != b"\n":
                    fh.write(b"\n")  # terminate the truncated final line first
            fh.write((marker + "\n").encode("utf-8"))
    except OSError:
        # Deliberately non-fatal: dismissing a banner must never crash the UI.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def _crash_headline(summary: Summary) -> Finding:
    """Build the headline finding for a capture that ended without a clean stop.

    Severity is CRITICAL when the summary contains a `gpu-lost` event, WARNING otherwise.
    The detail text quotes the local time of `summary.end` ("?" when it is unset) and the
    number of samples captured.
    """
    saw_gpu_lost = False
    for _ts, kind, _detail in summary.events:
        if kind == "gpu-lost":
            saw_gpu_lost = True
            break

    if summary.end:
        stopped_at = time.strftime("%H:%M:%S", time.localtime(summary.end))
    else:
        stopped_at = "?"

    parts = [
        f"The capture stopped abruptly at {stopped_at} after {summary.samples} samples, with no clean "
        "shutdown recorded — consistent with a hard freeze or power loss."
    ]
    if saw_gpu_lost:
        parts.append(" A GPU-lost event was captured during the session.")

    severity = CRITICAL if saw_gpu_lost else WARNING
    return Finding(
        severity,
        "Diagnostic",
        "Session ended without a clean stop (likely a hard crash)",
        "".join(parts),
        "Review the last readings (Capture, above) and the crash-boot findings below.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_crash(last_n: int = 15) -> DiagnosticResult:
    """Analyze a recorded hard crash.

    Combines three sources into one result: the headline built from the captured window,
    the findings from the previous boot's kernel log, and the remaining health checks run
    with the journal scan disabled. Findings are ordered by `_SEV_ORDER`, worst first.

    `last_n` is passed through to `summarize()` for the diagnostic log.
    """
    from .health import check_previous_boot, run_health_checks

    summary = summarize(config.DIAG_LOG, last_n=last_n)
    collected: list[Finding] = [
        _crash_headline(summary),
        *check_previous_boot(),  # the crashed boot's kernel log
        *run_health_checks(include_journal=False),  # SMART/driver/persistence/temps
    ]
    # Stable sort: findings of equal severity keep the order they were collected in.
    ranked = sorted(collected, key=lambda finding: _SEV_ORDER.get(finding.severity, 9))
    return DiagnosticResult(game=_game_from_summary(summary), summary=summary, findings=ranked)
|
||||||
|
|||||||
@@ -146,6 +146,22 @@ def check_journal() -> list[Finding]:
|
|||||||
return findings
|
return findings
|
||||||
|
|
||||||
|
|
||||||
|
def check_previous_boot() -> list[Finding]:
    """Scan the previous boot's kernel log — the boot that crashed — for fault signatures.

    Runs `journalctl -k -b -1` and feeds the output to `scan_journal_text()`. Each finding
    is re-created with its detail prefixed to say it came from the previous boot. Returns
    an empty list when journalctl yields no (or only whitespace) output.

    Needs persistent journald, else the crashed boot's logs were lost on reboot (the
    persistence check flags that separately).
    """
    kernel_log = _journalctl(["-k", "-b", "-1", "--no-pager", "-o", "cat"])
    if not (kernel_log and kernel_log.strip()):
        return []

    prefix = "Logged during the previous (crashed) boot. "
    return [
        Finding(
            hit.severity,
            hit.category,
            hit.title,
            (prefix + (hit.detail or "")).strip(),
            hit.suggestion,
        )
        for hit in scan_journal_text(kernel_log)
    ]
|
||||||
|
|
||||||
|
|
||||||
def check_journal_persistence() -> list[Finding]:
|
def check_journal_persistence() -> list[Finding]:
|
||||||
if Path("/var/log/journal").is_dir():
|
if Path("/var/log/journal").is_dir():
|
||||||
return []
|
return []
|
||||||
@@ -235,17 +251,21 @@ def check_live_temps() -> list[Finding]:
|
|||||||
)]
|
)]
|
||||||
|
|
||||||
|
|
||||||
def run_health_checks() -> list[Finding]:
|
def run_health_checks(include_journal: bool = True) -> list[Finding]:
|
||||||
"""Run all checks and return findings sorted by severity (worst first).
|
"""Run all checks and return findings sorted by severity (worst first).
|
||||||
|
|
||||||
SMART needs root; if the session collected it via launch elevation, use that
|
SMART needs root; if the session collected it via launch elevation, use that
|
||||||
instead of re-running smartctl (which would just report "needs root").
|
instead of re-running smartctl (which would just report "needs root").
|
||||||
|
|
||||||
|
`include_journal=False` skips the 7-day kernel-journal scan — used by the crash
|
||||||
|
analysis, which scans the previous (crashed) boot specifically instead.
|
||||||
"""
|
"""
|
||||||
from . import elevation
|
from . import elevation
|
||||||
|
|
||||||
findings: list[Finding] = []
|
findings: list[Finding] = []
|
||||||
findings += check_nvidia_driver()
|
findings += check_nvidia_driver()
|
||||||
findings += check_journal()
|
if include_journal:
|
||||||
|
findings += check_journal()
|
||||||
findings += check_journal_persistence()
|
findings += check_journal_persistence()
|
||||||
priv = elevation.privileged()
|
priv = elevation.privileged()
|
||||||
if priv is not None and priv.get("smart") is not None:
|
if priv is not None and priv.get("smart") is not None:
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ from PySide6.QtWidgets import (
|
|||||||
|
|
||||||
from ..config import load_config, update_config
|
from ..config import load_config, update_config
|
||||||
from .diagnostic_dialog import DiagnosticDialog
|
from .diagnostic_dialog import DiagnosticDialog
|
||||||
from .theme import ACCENT, GOOD, MUTED
|
from .theme import ACCENT, GOOD, MUTED, WARN
|
||||||
|
|
||||||
|
|
||||||
def _game_row(name: str, sublabel: str, size: str, is_new: bool, appid: str = "", on_diagnose=None) -> QFrame:
|
def _game_row(name: str, sublabel: str, size: str, is_new: bool, appid: str = "", on_diagnose=None) -> QFrame:
|
||||||
@@ -126,6 +126,27 @@ class GamesPage(QWidget):
|
|||||||
self._banner.hide()
|
self._banner.hide()
|
||||||
root.addWidget(self._banner)
|
root.addWidget(self._banner)
|
||||||
|
|
||||||
|
# Hard-crash banner: a previous diagnostic ended without a clean stop.
|
||||||
|
self._crash_banner = QFrame()
|
||||||
|
self._crash_banner.setObjectName("Card")
|
||||||
|
self._crash_banner.setStyleSheet(f"#Card {{ border: 1px solid {WARN}; }}")
|
||||||
|
crash_h = QHBoxLayout(self._crash_banner)
|
||||||
|
crash_h.setContentsMargins(16, 10, 16, 10)
|
||||||
|
crash_h.setSpacing(10)
|
||||||
|
self._crash_label = QLabel("")
|
||||||
|
self._crash_label.setWordWrap(True)
|
||||||
|
self._crash_label.setStyleSheet(f"color: {WARN}; font-weight: 700; background: transparent;")
|
||||||
|
crash_h.addWidget(self._crash_label, 1)
|
||||||
|
self._analyze_btn = QPushButton("Analyze crash")
|
||||||
|
self._analyze_btn.setObjectName("ActionButton")
|
||||||
|
self._analyze_btn.clicked.connect(self._analyze_crash)
|
||||||
|
crash_h.addWidget(self._analyze_btn)
|
||||||
|
self._dismiss_btn = QPushButton("Dismiss")
|
||||||
|
self._dismiss_btn.clicked.connect(self._dismiss_crash)
|
||||||
|
crash_h.addWidget(self._dismiss_btn)
|
||||||
|
self._crash_banner.hide()
|
||||||
|
root.addWidget(self._crash_banner)
|
||||||
|
|
||||||
self._diag_timer = QTimer(self)
|
self._diag_timer = QTimer(self)
|
||||||
self._diag_timer.setInterval(1000)
|
self._diag_timer.setInterval(1000)
|
||||||
self._diag_timer.timeout.connect(self._poll_diag)
|
self._diag_timer.timeout.connect(self._poll_diag)
|
||||||
@@ -163,6 +184,7 @@ class GamesPage(QWidget):
|
|||||||
|
|
||||||
self._load_cached() # instant display from the last scan
|
self._load_cached() # instant display from the last scan
|
||||||
QTimer.singleShot(400, self.refresh) # then rescan in the background on launch
|
QTimer.singleShot(400, self.refresh) # then rescan in the background on launch
|
||||||
|
self._check_crash() # surface an interrupted (crashed) diagnostic
|
||||||
|
|
||||||
# --- loading ----------------------------------------------------------------------
|
# --- loading ----------------------------------------------------------------------
|
||||||
|
|
||||||
@@ -357,8 +379,10 @@ class GamesPage(QWidget):
|
|||||||
|
|
||||||
def _on_diag_done(self, result) -> None:
|
def _on_diag_done(self, result) -> None:
|
||||||
self._banner.hide()
|
self._banner.hide()
|
||||||
|
self._crash_banner.hide()
|
||||||
self._finish_btn.setEnabled(True)
|
self._finish_btn.setEnabled(True)
|
||||||
self._discard_btn.setEnabled(True)
|
self._discard_btn.setEnabled(True)
|
||||||
|
self._analyze_btn.setEnabled(True)
|
||||||
if result is None:
|
if result is None:
|
||||||
QMessageBox.warning(self, "RigDoctor", "The diagnostic couldn't be analyzed.")
|
QMessageBox.warning(self, "RigDoctor", "The diagnostic couldn't be analyzed.")
|
||||||
return
|
return
|
||||||
@@ -371,6 +395,48 @@ class GamesPage(QWidget):
|
|||||||
reccontrol.stop_background()
|
reccontrol.stop_background()
|
||||||
self._banner.hide()
|
self._banner.hide()
|
||||||
|
|
||||||
|
# --- hard-crash recovery ----------------------------------------------------------
|
||||||
|
|
||||||
|
def _check_crash(self) -> None:
    """Show the hard-crash banner when an interrupted diagnostic is pending, else hide it."""
    from ..core import diagnostic

    crash = diagnostic.pending_crash()
    if crash is not None:
        title = crash.game if crash.game else "your last game"
        gpu_note = " · ⚠ GPU-lost was captured" if crash.gpu_lost else ""
        message = (
            f"⚠ Your last diagnostic for {title} ended unexpectedly — likely a hard crash "
            f"({crash.samples} samples{gpu_note}). Analyze it to see the final readings and the "
            "likely cause from the system logs."
        )
        self._crash_label.setText(message)
        self._analyze_btn.setEnabled(True)
        self._crash_banner.show()
    else:
        self._crash_banner.hide()
|
||||||
|
|
||||||
|
def _analyze_crash(self) -> None:
    """Acknowledge the pending crash, then analyze it on a background daemon thread."""
    from ..core import diagnostic

    # Acknowledge up front so this crash is not prompted for again.
    diagnostic.acknowledge_crash()
    self._crash_label.setText("Analyzing the crash (final readings + system logs)…")
    self._analyze_btn.setEnabled(False)
    worker = threading.Thread(target=self._work_analyze_crash, daemon=True)
    worker.start()
|
||||||
|
|
||||||
|
def _work_analyze_crash(self) -> None:
    """Thread target: run the crash analysis and emit its outcome on `_diag_done`.

    Any exception from the analysis is converted to a `None` result so a signal is
    always emitted.
    """
    from ..core import diagnostic

    try:
        outcome = diagnostic.analyze_crash()
    except Exception:
        # Boundary of the worker thread: report failure as None instead of raising here.
        outcome = None
    self._diag_done.emit(outcome)
|
||||||
|
|
||||||
|
def _dismiss_crash(self) -> None:
    """Dismiss the crash banner: hide it and record the crash as acknowledged."""
    from ..core import diagnostic

    self._crash_banner.hide()
    diagnostic.acknowledge_crash()
|
||||||
|
|
||||||
# --- nav badge integration --------------------------------------------------------
|
# --- nav badge integration --------------------------------------------------------
|
||||||
|
|
||||||
def showEvent(self, event) -> None: # noqa: N802 (Qt override)
|
def showEvent(self, event) -> None: # noqa: N802 (Qt override)
|
||||||
@@ -392,3 +458,5 @@ class GamesPage(QWidget):
|
|||||||
self._banner.show()
|
self._banner.show()
|
||||||
if not self._diag_timer.isActive():
|
if not self._diag_timer.isActive():
|
||||||
self._diag_timer.start()
|
self._diag_timer.start()
|
||||||
|
else:
|
||||||
|
self._check_crash() # re-surface an interrupted diagnostic if one is pending
|
||||||
|
|||||||
@@ -57,5 +57,51 @@ class FinishTests(unittest.TestCase):
|
|||||||
self.assertTrue(any(kind == "gpu-lost" for _ts, kind, _d in result.summary.events))
|
self.assertTrue(any(kind == "gpu-lost" for _ts, kind, _d in result.summary.events))
|
||||||
|
|
||||||
|
|
||||||
|
class CrashDetectionTests(unittest.TestCase):
    """pending_crash() / acknowledge_crash() exercised against a temporary diagnostic log."""

    def _diag_log(self, d) -> Path:
        return Path(d) / "diagnostic.jsonl"

    def _patches(self, log, pid=None):
        """Patchers that point the module at `log` and fake the recorder's pid."""
        return (
            mock.patch.object(diagnostic.config, "DIAG_LOG", log),
            mock.patch.object(diagnostic.reccontrol, "running_pid", return_value=pid),
        )

    def test_unterminated_session_is_a_pending_crash(self):
        with tempfile.TemporaryDirectory() as tmp:
            log = self._diag_log(tmp)
            _write_log(str(log), "Tarkov")  # has session-start + game, no session-stop
            log_patch, pid_patch = self._patches(log)
            with log_patch, pid_patch:
                info = diagnostic.pending_crash()
                self.assertIsNotNone(info)
                self.assertEqual(info.game, "Tarkov")
                self.assertTrue(info.gpu_lost)  # _write_log writes a gpu-lost event

    def test_clean_stop_is_not_a_crash(self):
        with tempfile.TemporaryDirectory() as tmp:
            log = self._diag_log(tmp)
            writer = CrashLogWriter(str(log))
            writer.write_event("session-start")
            writer.write_event("game", "X")
            writer.write_sample(Sample(time.time(), [Reading("gpu", "temp", 60.0, "°C", "")]))
            writer.write_event("session-stop", "samples=1")
            writer.close()
            log_patch, pid_patch = self._patches(log)
            with log_patch, pid_patch:
                self.assertIsNone(diagnostic.pending_crash())

    def test_acknowledge_clears_pending_crash(self):
        with tempfile.TemporaryDirectory() as tmp:
            log = self._diag_log(tmp)
            _write_log(str(log), "Tarkov")
            log_patch, pid_patch = self._patches(log)
            with log_patch, pid_patch:
                self.assertIsNotNone(diagnostic.pending_crash())
                diagnostic.acknowledge_crash()
                self.assertIsNone(diagnostic.pending_crash())

    def test_running_capture_is_not_a_crash(self):
        with tempfile.TemporaryDirectory() as tmp:
            log = self._diag_log(tmp)
            _write_log(str(log), "Tarkov")
            log_patch, pid_patch = self._patches(log, pid=4321)
            with log_patch, pid_patch:
                self.assertIsNone(diagnostic.pending_crash())  # it's in-progress, not crashed
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user