diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d1c280..f11c554 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,19 @@ All notable changes to RigDoctor are recorded here. Format follows (`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git release tag (so the auto-updater, D18, can compare versions). +## [0.15.0] - 2026-05-22 +### Added +- **Hard-crash detection & recovery for the guided diagnostic.** If a focused capture ends + without a clean stop (the recorder never wrote `session-stop` and isn't running), RigDoctor + treats it as a likely hard freeze. On launch the **Games** page shows a warning banner — + *"Your last diagnostic for {game} ended unexpectedly…"* — with **Analyze crash** / **Dismiss**. +- **Deeper crash analysis.** *Analyze crash* combines the captured window (final readings before + the freeze + any GPU-lost event) with a focused scan of the **previous (crashed) boot's kernel + log** (`journalctl -k -b -1`: Xid/panic/OOM/MCE/AER/thermal) plus SMART/driver/persistence/ + live-temp checks — the full "what happened" picture. `core/diagnostic.py` gains + `pending_crash()` / `analyze_crash()`; `health.check_previous_boot()` + + `run_health_checks(include_journal=False)` back it. + ## [0.14.0] - 2026-05-22 ### Changed - **Dashboard headline tiles are now history trend graphs** instead of single-value gauges — diff --git a/pyproject.toml b/pyproject.toml index 4cc73ef..edab750 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "rigdoctor" -version = "0.14.0" +version = "0.15.0" description = "Modular hardware monitoring & crash diagnostics for Linux gamers." 
readme = "README.md" requires-python = ">=3.11" diff --git a/src/rigdoctor/__init__.py b/src/rigdoctor/__init__.py index 824b7ac..8370be9 100644 --- a/src/rigdoctor/__init__.py +++ b/src/rigdoctor/__init__.py @@ -1,3 +1,3 @@ """RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers.""" -__version__ = "0.14.0" +__version__ = "0.15.0" diff --git a/src/rigdoctor/core/diagnostic.py b/src/rigdoctor/core/diagnostic.py index c4455a8..af0809f 100644 --- a/src/rigdoctor/core/diagnostic.py +++ b/src/rigdoctor/core/diagnostic.py @@ -11,13 +11,16 @@ The capture is **manually bracketed** (start/finish) for now; auto start/stop on from __future__ import annotations +import json import time from dataclasses import dataclass from .. import config from . import reccontrol from .crashlog import Summary, summarize -from .health import Finding +from .health import CRITICAL, OK, WARNING, Finding + +_SEV_ORDER = {CRITICAL: 0, WARNING: 1, "info": 2, OK: 3} @dataclass @@ -27,6 +30,14 @@ class DiagnosticResult: findings: list[Finding] # health findings: Xid/SMART/driver/etc. (M4) +@dataclass +class CrashInfo: + game: str | None + samples: int + when: float | None # ts of the last captured sample (≈ when the freeze hit) + gpu_lost: bool + + def _clear_diag_log() -> None: """Each diagnostic is a fresh focused capture — drop any previous session + segments.""" base = config.DIAG_LOG @@ -82,3 +93,70 @@ def finish(last_n: int = 10, log_path=None) -> DiagnosticResult: game = _game_from_summary(summary) or (reccontrol.read_status() or {}).get("game") findings = run_health_checks() return DiagnosticResult(game=game, summary=summary, findings=findings) + + +# --- hard-crash detection & post-crash analysis ----------------------------------- + +def pending_crash() -> CrashInfo | None: + """Detect a diagnostic that ended abnormally (no clean stop, no live recorder). + + A focused capture writes `session-start` (+ `game`) and, on a clean stop, `session-stop`. 
+ After a hard freeze that block never runs, so the log has a start with no stop and no + live recorder — that's our hard-crash signal. Returns None if a capture is running, none + is recorded, it stopped cleanly, or the user already acknowledged it. + """ + if is_running() or not config.DIAG_LOG.exists(): + return None + summary = summarize(config.DIAG_LOG) + kinds = {kind for _ts, kind, _detail in summary.events} + if "session-start" not in kinds: + return None + if "session-stop" in kinds or "diagnostic-acknowledged" in kinds: + return None + return CrashInfo( + game=_game_from_summary(summary), + samples=summary.samples, + when=summary.end, + gpu_lost="gpu-lost" in kinds, + ) + + +def acknowledge_crash() -> None: + """Mark the recorded crash as seen so it stops prompting (appends a marker event).""" + try: + config.DIAG_LOG.parent.mkdir(parents=True, exist_ok=True) + with open(config.DIAG_LOG, "a", encoding="utf-8") as fh: + fh.write(json.dumps({"ts": time.time(), "event": "diagnostic-acknowledged", "detail": ""}) + "\n") + except OSError: + pass + + +def _crash_headline(summary: Summary) -> Finding: + gpu_lost = any(kind == "gpu-lost" for _ts, kind, _detail in summary.events) + when = time.strftime("%H:%M:%S", time.localtime(summary.end)) if summary.end else "?" + detail = ( + f"The capture stopped abruptly at {when} after {summary.samples} samples, with no clean " + "shutdown recorded — consistent with a hard freeze or power loss." + ) + if gpu_lost: + detail += " A GPU-lost event was captured during the session." 
+ return Finding( + CRITICAL if gpu_lost else WARNING, + "Diagnostic", + "Session ended without a clean stop (likely a hard crash)", + detail, + "Review the last readings (Capture, above) and the crash-boot findings below.", + ) + + +def analyze_crash(last_n: int = 15) -> DiagnosticResult: + """Analyze a recorded hard crash: the captured window + the previous boot's kernel log + + the rest of the health report (SMART/driver/persistence/temps).""" + from .health import check_previous_boot, run_health_checks + + summary = summarize(config.DIAG_LOG, last_n=last_n) + findings: list[Finding] = [_crash_headline(summary)] + findings += check_previous_boot() # the crashed boot's kernel log + findings += run_health_checks(include_journal=False) # SMART/driver/persistence/temps + findings.sort(key=lambda f: _SEV_ORDER.get(f.severity, 9)) + return DiagnosticResult(game=_game_from_summary(summary), summary=summary, findings=findings) diff --git a/src/rigdoctor/core/health.py b/src/rigdoctor/core/health.py index 5fdf3aa..3ce1907 100644 --- a/src/rigdoctor/core/health.py +++ b/src/rigdoctor/core/health.py @@ -146,6 +146,22 @@ def check_journal() -> list[Finding]: return findings +def check_previous_boot() -> list[Finding]: + """Scan the previous boot's kernel log — the boot that crashed — for fault signatures. + + Needs persistent journald (else the crashed boot's logs were lost on reboot, which the + persistence check flags separately). Findings are framed as coming from that boot. + """ + out = _journalctl(["-k", "-b", "-1", "--no-pager", "-o", "cat"]) + if not out or not out.strip(): + return [] + tagged = [] + for f in scan_journal_text(out): + detail = ("Logged during the previous (crashed) boot. 
" + (f.detail or "")).strip() + tagged.append(Finding(f.severity, f.category, f.title, detail, f.suggestion)) + return tagged + + def check_journal_persistence() -> list[Finding]: if Path("/var/log/journal").is_dir(): return [] @@ -235,17 +251,21 @@ def check_live_temps() -> list[Finding]: )] -def run_health_checks() -> list[Finding]: +def run_health_checks(include_journal: bool = True) -> list[Finding]: """Run all checks and return findings sorted by severity (worst first). SMART needs root; if the session collected it via launch elevation, use that instead of re-running smartctl (which would just report "needs root"). + + `include_journal=False` skips the 7-day kernel-journal scan — used by the crash + analysis, which scans the previous (crashed) boot specifically instead. """ from . import elevation findings: list[Finding] = [] findings += check_nvidia_driver() - findings += check_journal() + if include_journal: + findings += check_journal() findings += check_journal_persistence() priv = elevation.privileged() if priv is not None and priv.get("smart") is not None: diff --git a/src/rigdoctor/gui/games_page.py b/src/rigdoctor/gui/games_page.py index 62e87a9..3a51dab 100644 --- a/src/rigdoctor/gui/games_page.py +++ b/src/rigdoctor/gui/games_page.py @@ -26,7 +26,7 @@ from PySide6.QtWidgets import ( from ..config import load_config, update_config from .diagnostic_dialog import DiagnosticDialog -from .theme import ACCENT, GOOD, MUTED +from .theme import ACCENT, GOOD, MUTED, WARN def _game_row(name: str, sublabel: str, size: str, is_new: bool, appid: str = "", on_diagnose=None) -> QFrame: @@ -126,6 +126,27 @@ class GamesPage(QWidget): self._banner.hide() root.addWidget(self._banner) + # Hard-crash banner: a previous diagnostic ended without a clean stop. 
+ self._crash_banner = QFrame() + self._crash_banner.setObjectName("Card") + self._crash_banner.setStyleSheet(f"#Card {{ border: 1px solid {WARN}; }}") + crash_h = QHBoxLayout(self._crash_banner) + crash_h.setContentsMargins(16, 10, 16, 10) + crash_h.setSpacing(10) + self._crash_label = QLabel("") + self._crash_label.setWordWrap(True) + self._crash_label.setStyleSheet(f"color: {WARN}; font-weight: 700; background: transparent;") + crash_h.addWidget(self._crash_label, 1) + self._analyze_btn = QPushButton("Analyze crash") + self._analyze_btn.setObjectName("ActionButton") + self._analyze_btn.clicked.connect(self._analyze_crash) + crash_h.addWidget(self._analyze_btn) + self._dismiss_btn = QPushButton("Dismiss") + self._dismiss_btn.clicked.connect(self._dismiss_crash) + crash_h.addWidget(self._dismiss_btn) + self._crash_banner.hide() + root.addWidget(self._crash_banner) + self._diag_timer = QTimer(self) self._diag_timer.setInterval(1000) self._diag_timer.timeout.connect(self._poll_diag) @@ -163,6 +184,7 @@ class GamesPage(QWidget): self._load_cached() # instant display from the last scan QTimer.singleShot(400, self.refresh) # then rescan in the background on launch + self._check_crash() # surface an interrupted (crashed) diagnostic # --- loading ---------------------------------------------------------------------- @@ -357,8 +379,10 @@ class GamesPage(QWidget): def _on_diag_done(self, result) -> None: self._banner.hide() + self._crash_banner.hide() self._finish_btn.setEnabled(True) self._discard_btn.setEnabled(True) + self._analyze_btn.setEnabled(True) if result is None: QMessageBox.warning(self, "RigDoctor", "The diagnostic couldn't be analyzed.") return @@ -371,6 +395,48 @@ class GamesPage(QWidget): reccontrol.stop_background() self._banner.hide() + # --- hard-crash recovery ---------------------------------------------------------- + + def _check_crash(self) -> None: + from ..core import diagnostic + + info = diagnostic.pending_crash() + if info is None: + 
self._crash_banner.hide() + return + game = info.game or "your last game" + extra = " · ⚠ GPU-lost was captured" if info.gpu_lost else "" + self._crash_label.setText( + f"⚠ Your last diagnostic for {game} ended unexpectedly — likely a hard crash " + f"({info.samples} samples{extra}). Analyze it to see the final readings and the " + f"likely cause from the system logs." + ) + self._analyze_btn.setEnabled(True) + self._crash_banner.show() + + def _analyze_crash(self) -> None: + from ..core import diagnostic + + diagnostic.acknowledge_crash() # don't prompt again for this one + self._analyze_btn.setEnabled(False) + self._crash_label.setText("Analyzing the crash (final readings + system logs)…") + threading.Thread(target=self._work_analyze_crash, daemon=True).start() + + def _work_analyze_crash(self) -> None: + from ..core import diagnostic + + try: + result = diagnostic.analyze_crash() + except Exception: + result = None + self._diag_done.emit(result) + + def _dismiss_crash(self) -> None: + from ..core import diagnostic + + diagnostic.acknowledge_crash() + self._crash_banner.hide() + # --- nav badge integration -------------------------------------------------------- def showEvent(self, event) -> None: # noqa: N802 (Qt override) @@ -392,3 +458,5 @@ class GamesPage(QWidget): self._banner.show() if not self._diag_timer.isActive(): self._diag_timer.start() + else: + self._check_crash() # re-surface an interrupted diagnostic if one is pending diff --git a/tests/test_diagnostic.py b/tests/test_diagnostic.py index 46ec89e..ff1d40a 100644 --- a/tests/test_diagnostic.py +++ b/tests/test_diagnostic.py @@ -57,5 +57,51 @@ class FinishTests(unittest.TestCase): self.assertTrue(any(kind == "gpu-lost" for _ts, kind, _d in result.summary.events)) +class CrashDetectionTests(unittest.TestCase): + def _diag_log(self, d) -> Path: + return Path(d) / "diagnostic.jsonl" + + def test_unterminated_session_is_a_pending_crash(self): + with tempfile.TemporaryDirectory() as d: + log = 
self._diag_log(d) + _write_log(str(log), "Tarkov") # has session-start + game, no session-stop + with mock.patch.object(diagnostic.config, "DIAG_LOG", log), \ + mock.patch.object(diagnostic.reccontrol, "running_pid", return_value=None): + info = diagnostic.pending_crash() + self.assertIsNotNone(info) + self.assertEqual(info.game, "Tarkov") + self.assertTrue(info.gpu_lost) # _write_log writes a gpu-lost event + + def test_clean_stop_is_not_a_crash(self): + with tempfile.TemporaryDirectory() as d: + log = self._diag_log(d) + w = CrashLogWriter(str(log)) + w.write_event("session-start"); w.write_event("game", "X") + w.write_sample(Sample(time.time(), [Reading("gpu", "temp", 60.0, "°C", "")])) + w.write_event("session-stop", "samples=1") + w.close() + with mock.patch.object(diagnostic.config, "DIAG_LOG", log), \ + mock.patch.object(diagnostic.reccontrol, "running_pid", return_value=None): + self.assertIsNone(diagnostic.pending_crash()) + + def test_acknowledge_clears_pending_crash(self): + with tempfile.TemporaryDirectory() as d: + log = self._diag_log(d) + _write_log(str(log), "Tarkov") + with mock.patch.object(diagnostic.config, "DIAG_LOG", log), \ + mock.patch.object(diagnostic.reccontrol, "running_pid", return_value=None): + self.assertIsNotNone(diagnostic.pending_crash()) + diagnostic.acknowledge_crash() + self.assertIsNone(diagnostic.pending_crash()) + + def test_running_capture_is_not_a_crash(self): + with tempfile.TemporaryDirectory() as d: + log = self._diag_log(d) + _write_log(str(log), "Tarkov") + with mock.patch.object(diagnostic.config, "DIAG_LOG", log), \ + mock.patch.object(diagnostic.reccontrol, "running_pid", return_value=4321): + self.assertIsNone(diagnostic.pending_crash()) # it's in-progress, not crashed + + if __name__ == "__main__": unittest.main()