feat: guided diagnostic session (CLI) — pick a game, capture, analyze — 0.11.0

The seed use case end to end, orchestrating M3 + M4 (ARCHITECTURE §7.1).

- core/diagnostic.py: start(game) runs a focused, game-tagged capture into a dedicated diagnostic log (window-scoped report, separate from the always-on crash log); finish() stops it and combines the capture summary (M3) with the health findings (M4). Game recorded as a log event so it survives crash+reboot.
- CLI: rigdoctor diagnose start --game/--appid | status | finish.
- recorder/record run gained an optional --game tag; reccontrol passes it through.
- Tests for game recovery + the finish() combination.

GUI/tray "Run Diagnostic" button and auto start/stop (D12 wrapper) come next.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 08:27:53 +02:00
parent 8b1083a29b
commit 5682878f22
10 changed files with 263 additions and 5 deletions
@@ -5,6 +5,18 @@ All notable changes to RigDoctor are recorded here. Format follows
 (`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
 release tag (so the auto-updater, D18, can compare versions).

+## [0.11.0] - 2026-05-22
+### Added
+- **Guided diagnostic session (CLI) — the seed use case, end to end.** `rigdoctor diagnose
+  start --game "<name>"` runs a **focused crash-capture tagged with that game** (its own
+  diagnostic log, so the report is scoped to just that session), `diagnose status` shows
+  progress, and `diagnose finish` stops it and prints a combined report: the **capture
+  summary** (peak temps/power, GPU-lost events, last samples — M3) plus the **health findings**
+  (Xid/SMART/driver/etc. — M4). The game can be given by `--game` or `--appid` (resolved from
+  the Steam scan), and is recorded as a log event so it survives a crash + reboot.
+- Shared orchestration lives in `core/diagnostic.py` (one callable for CLI/GUI/tray, per
+  ARCHITECTURE §7.1); the recorder/`record run` gained an optional `--game` tag.
+
 ## [0.10.2] - 2026-05-22
 ### Changed
 - When an Environment **Apply**/**Install** fails, the status now shows the **real reason**
@@ -40,8 +40,11 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`).
 - [ ] M10 desktop GUI (PySide6: dashboard, log browser, report viewer, logger controls)
 - [ ] M11 tray / menu-bar applet (QSystemTrayIcon: live M1 readouts + Run Diagnostic +
      supporting actions — D13)
- [ ] Guided diagnostic session (pick game → focused M3 capture → M4 scan → findings),
-      shared by tray/GUI/CLI
+- [~] Guided diagnostic session (pick game → focused M3 capture → M4 scan → findings),
+      shared by tray/GUI/CLI — *core + CLI done* (`core/diagnostic.py`, `rigdoctor diagnose
+      start/status/finish`): tags a focused capture with the chosen game (own diagnostic log,
+      window-scoped report) and combines the capture summary with the M4 findings. *Pending:*
+      the GUI/tray "Run Diagnostic" button, and auto start/stop via the D12 wrapper/watcher.
 - [ ] Logger trigger modes: always-on + game-launch (D12 — wrapper first:
      `rigdoctor wrap %command%` + global Steam compat-tool; zero-config watcher
      (Steam RunningAppID + /proc) and GameMode hook follow)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "rigdoctor"
-version = "0.10.2"
+version = "0.11.0"
 description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
 readme = "README.md"
 requires-python = ">=3.11"
@@ -1,3 +1,3 @@
 """RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""

-__version__ = "0.10.2"
+__version__ = "0.11.0"
@@ -86,6 +86,7 @@ def cmd_record_run(args) -> int:
        max_bytes=cfg["log_max_bytes"],
        backups=cfg["log_backups"],
        status_path=config.STATUS_FILE,
+        game=getattr(args, "game", None),
    )

    def _handle(_sig, _frame):
@@ -345,6 +346,77 @@ def cmd_report(args) -> int:
    return 0


+def _resolve_game(args) -> str | None:
+    """Work out which game the user asked for.
+
+    An explicit ``--game`` wins and is returned as typed. Otherwise ``--appid`` is
+    compared (as a string) against the games returned by the Steam scan. Returns None
+    when neither option is set, or when the appid matches nothing.
+    """
+    explicit = getattr(args, "game", None)
+    if explicit:
+        return explicit
+
+    appid = getattr(args, "appid", None)
+    if not appid:
+        return None
+
+    from .core import steam  # deferred: only the --appid path needs the Steam scan
+
+    wanted = str(appid)
+    scanned = steam.scan_games(steam.selected_library_paths())
+    return next((entry.name for entry in scanned if entry.appid == wanted), None)
+
+
+def cmd_diagnose(args) -> int:
+    """Entry point for ``rigdoctor diagnose [start|status|finish]``.
+
+    - ``start``: resolve the game (``--game``/``--appid``), spawn a focused capture and
+      check one second later that the recorder process is still alive.
+    - ``status`` (also used when no sub-command is given): print the sample count of the
+      running capture.
+    - ``finish``: stop the capture and print the capture summary plus the health findings.
+
+    Returns the exit code: 0 on success, 1 when the request could not be carried out.
+    """
+    import shlex
+
+    from .core import diagnostic, reccontrol, steam
+
+    sub = args.diagnose_cmd or "status"
+
+    if sub == "start":
+        if reccontrol.running_pid():
+            print("A capture is already running — finish it with: rigdoctor diagnose finish")
+            return 1
+        game = _resolve_game(args)
+        if game is None and (args.game or args.appid):
+            # The user named a game/appid but nothing matched — don't fall back to the list.
+            print("Couldn't match that game in your selected Steam libraries.")
+            return 1
+        if game is None:
+            games = steam.cached_games() or steam.scan_games(steam.selected_library_paths())
+            if games:
+                print("Pick a game to focus on, then re-run with --game:")
+                for g in games:
+                    # shlex.quote rather than repr(): this line is meant to be pasted into
+                    # a shell, and repr() escapes (\', \\) or unquoted $ are not shell-safe.
+                    print(f"  --game {shlex.quote(str(g.name))}")
+            else:
+                print("No games detected. Select a library: rigdoctor games libraries --all")
+            return 1
+        pid = diagnostic.start(game=game, interval=args.interval)
+        # Give the detached recorder a moment so an immediate startup failure is visible.
+        time.sleep(1.0)
+        if pid and reccontrol.pid_alive(pid):
+            print(f"Diagnostic capture started for {game!r} (pid {pid}).")
+            print("  Play your game. When you're done (or after a crash + reboot):")
+            print("    rigdoctor diagnose finish")
+            return 0
+        print(f"Capture failed to start; see {config.SPAWN_LOG}")
+        return 1
+
+    if sub == "status":
+        status = diagnostic.active()
+        if not status:
+            print("No diagnostic capture is running.")
+            return 0
+        game = status.get("game") or "—"
+        print(f"Capturing for {game!r}: {status.get('samples', 0)} samples"
+              + (" · GPU-lost seen" if status.get("gpu_lost") else ""))
+        return 0
+
+    # finish — also valid with no recorder running (e.g. after a crash + reboot), as long
+    # as a diagnostic log was left behind to analyze.
+    if not reccontrol.running_pid() and not config.DIAG_LOG.exists():
+        print("No diagnostic to analyze. Start one with: rigdoctor diagnose start --game <name>")
+        return 1
+    print("Stopping capture and analyzing…\n")
+    result = diagnostic.finish(last_n=args.last)
+    from .render import render_health, render_summary
+
+    if result.game:
+        print(f"Diagnostic — {result.game}\n")
+    print(render_summary(result.summary, log_path=config.DIAG_LOG))
+    print("\n" + render_health(result.findings, title="Findings"))
+    return 0
+
+
 def cmd_gameenv(args) -> int:
    from dataclasses import asdict

@@ -470,6 +542,7 @@ def build_parser() -> argparse.ArgumentParser:
    run_p = rec_sub.add_parser("run", help="run the capture loop in the foreground (systemd-friendly)")
    run_p.add_argument("-n", "--interval", type=float, default=None, help="sampling interval (s)")
    run_p.add_argument("-o", "--out", default=None, help="log file path")
+    run_p.add_argument("--game", default=None, help="tag the capture with a game name (M6/diagnose)")
    run_p.set_defaults(func=cmd_record_run)

    start_p = rec_sub.add_parser("start", help="start recording in the background")
@@ -519,6 +592,19 @@ def build_parser() -> argparse.ArgumentParser:
    env_p = sub.add_parser("gameenv", help="gaming environment checks (M6): flag stability/perf settings")
    env_p.add_argument("--json", action="store_true", help="output JSON instead of text")
    env_p.set_defaults(func=cmd_gameenv)
+
+    diag_p = sub.add_parser("diagnose", help="guided diagnostic: capture while gaming, then analyze")
+    diag_sub = diag_p.add_subparsers(dest="diagnose_cmd")
+    diag_start = diag_sub.add_parser("start", help="start a focused capture for a game")
+    diag_start.add_argument("--game", default=None, help="game name to focus on")
+    diag_start.add_argument("--appid", default=None, help="Steam appid to focus on (resolved to a name)")
+    diag_start.add_argument("-n", "--interval", type=float, default=None, help="sampling interval (s)")
+    diag_start.set_defaults(func=cmd_diagnose)
+    diag_sub.add_parser("status", help="show the in-progress diagnostic").set_defaults(func=cmd_diagnose)
+    diag_finish = diag_sub.add_parser("finish", help="stop the capture and analyze it")
+    diag_finish.add_argument("--last", type=int, default=10, help="recent samples to show")
+    diag_finish.set_defaults(func=cmd_diagnose)
+    diag_p.set_defaults(func=cmd_diagnose, diagnose_cmd=None, last=10)
    return p


@@ -23,6 +23,9 @@ CONFIG_FILE = CONFIG_DIR / "config.toml"

 # Crash-capture logger (M3)
 LOG_FILE = LOG_DIR / "capture.jsonl"
+# Guided diagnostic (M6/D12): a focused capture writes here, separate from the always-on
+# crash log, so its report covers only that session's window.
+DIAG_LOG = LOG_DIR / "diagnostic.jsonl"
 STATUS_FILE = STATE_DIR / "recorder.json"
 PID_FILE = STATE_DIR / "recorder.pid"
 SPAWN_LOG = STATE_DIR / "recorder.out"
@@ -0,0 +1,84 @@
+"""Guided diagnostic session (SPEC §4 / ARCHITECTURE §7.1): orchestrate M3 + M4.
+
+The seed use case, one flow: **pick a game** → **focused crash-capture** scoped to that
+session (M3, tagged with the game) → on **finish**, **scan & analyze** (M4 health report)
+over the captured window + system logs → return a prioritized result. This is not a new
+module — it's a single shared callable so the CLI, GUI, and tray run the identical flow.
+
+The capture is **manually bracketed** (start/finish) for now; auto start/stop on game launch
+(the D12 wrapper/watcher) plugs in here later without changing the result shape.
+"""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+
+from .. import config
+from . import reccontrol
+from .crashlog import Summary, summarize
+from .health import Finding
+
+
+@dataclass
+class DiagnosticResult:
+    """Outcome of `finish()`: the focused game plus both halves of the analysis."""
+
+    # Game the capture was tagged with; None when no tag could be recovered.
+    game: str | None
+    summary: Summary           # capture window: peak temps/power, events, last samples (M3)
+    findings: list[Finding]    # health findings: Xid/SMART/driver/etc. (M4)
+
+
+def _clear_diag_log() -> None:
+    """Remove the diagnostic log and any sibling files named ``<log name>.*``.
+
+    Best-effort: a file that is missing or cannot be removed is skipped silently, so
+    starting a new session never fails here.
+    """
+    log = config.DIAG_LOG
+    stale = [log]
+    stale.extend(log.parent.glob(f"{log.name}.*"))
+    for victim in stale:
+        try:
+            victim.unlink()
+        except OSError:
+            continue
+
+
+def start(game: str | None = None, interval: float | None = None) -> int | None:
+    """Launch a fresh background capture that writes to the diagnostic log.
+
+    Any previous diagnostic log is cleared first. `game` and `interval` are passed
+    through to the recorder. Returns the recorder pid, or None when a capture is
+    already running (nothing is cleared or started in that case).
+    """
+    already_running = reccontrol.running_pid()
+    if already_running:
+        return None
+    _clear_diag_log()
+    target = str(config.DIAG_LOG)
+    return reccontrol.start_background(interval=interval, out=target, game=game)
+
+
+def is_running() -> bool:
+    """True while `reccontrol` reports a recorder pid."""
+    pid = reccontrol.running_pid()
+    return pid is not None
+
+
+def active() -> dict | None:
+    """Return the recorder's status dict while a capture is running, else None."""
+    return reccontrol.read_status() if is_running() else None
+
+
+def _await_stopped(timeout: float = 6.0) -> None:
+    """Poll every 0.1 s until no recorder pid is reported or `timeout` seconds elapse."""
+    give_up_at = time.monotonic() + timeout
+    while True:
+        if not reccontrol.running_pid():
+            return
+        if time.monotonic() >= give_up_at:
+            return
+        time.sleep(0.1)
+
+
+def _game_from_summary(summary: Summary) -> str | None:
+    """Return the detail of the last non-empty 'game' event in the summary, or None.
+
+    The tag is read from the log rather than live state, so it is still available
+    after a crash + reboot.
+    """
+    tagged = (
+        detail
+        for _ts, kind, detail in reversed(summary.events)
+        if kind == "game" and detail
+    )
+    return next(tagged, None)
+
+
+def finish(last_n: int = 10, log_path=None) -> DiagnosticResult:
+    """End the session and build the combined result.
+
+    Asks the recorder to stop and waits briefly for it to exit, summarizes `log_path`
+    (default: the diagnostic log) keeping the last `last_n` samples, then runs the
+    health checks. The game comes from the log's 'game' event, falling back to the
+    recorder status file.
+    """
+    from .health import run_health_checks
+
+    reccontrol.stop_background()
+    _await_stopped()
+
+    source = config.DIAG_LOG if not log_path else log_path
+    window = summarize(source, last_n=last_n)
+
+    focused = _game_from_summary(window)
+    if not focused:
+        focused = (reccontrol.read_status() or {}).get("game")
+
+    return DiagnosticResult(game=focused, summary=window, findings=run_health_checks())
@@ -38,7 +38,9 @@ def read_status() -> dict | None:
        return None


-def start_background(interval: float | None = None, out: str | None = None) -> int | None:
+def start_background(
+    interval: float | None = None, out: str | None = None, game: str | None = None
+) -> int | None:
    """Spawn a detached `record run`. Returns the child pid, or None if already running."""
    if running_pid():
        return None
@@ -48,6 +50,8 @@ def start_background(interval: float | None = None, out: str | None = None) -> i
        cmd += ["--interval", str(interval)]
    if out:
        cmd += ["--out", out]
+    if game:
+        cmd += ["--game", game]
    out_fh = open(config.SPAWN_LOG, "a")
    proc = subprocess.Popen(
        cmd,
@@ -27,12 +27,14 @@ class Recorder:
        backups: int = 10,
        status_path=None,
        sampler: Sampler | None = None,
+        game: str | None = None,
    ) -> None:
        self.interval = interval
        self.sampler = sampler or Sampler(available_sources())
        self.writer = CrashLogWriter(log_path, max_bytes, backups)
        self.log_path = Path(log_path)
        self.status_path = Path(status_path) if status_path else None
+        self.game = game or None
        self.samples = 0
        self._stop = threading.Event()
        self._gpu_lost = False
@@ -43,6 +45,8 @@ class Recorder:

    def run(self) -> None:
        self.writer.write_event("session-start", f"interval={self.interval:g}s")
+        if self.game:
+            self.writer.write_event("game", self.game)  # tag the focused-diagnostic target
        self._write_status(running=True)
        try:
            while not self._stop.is_set():
@@ -81,6 +85,7 @@ class Recorder:
            "samples": self.samples,
            "updated": time.time(),
            "gpu_lost": self._gpu_lost,
+            "game": self.game,
        }
        if sample is not None:
            data["latest"] = headline(sample)
@@ -0,0 +1,61 @@
+"""Tests for the guided diagnostic orchestration (M3+M4 glue)."""
+
+import tempfile
+import time
+import unittest
+from pathlib import Path
+from unittest import mock
+
+from rigdoctor.core import diagnostic
+from rigdoctor.core.crashlog import CrashLogWriter, summarize
+from rigdoctor.core.health import Finding
+from rigdoctor.core.sample import Reading, Sample
+
+
+def _write_log(path: str, game: str) -> None:
+    """Write a fixture log: session start, game tag, three GPU temps, then a gpu-lost event."""
+    writer = CrashLogWriter(path)
+    writer.write_event("session-start", "interval=1s")
+    writer.write_event("game", game)
+    for celsius in (60.0, 72.0, 81.0):
+        reading = Reading("gpu", "temp", celsius, "°C", "")
+        writer.write_sample(Sample(ts=time.time(), readings=[reading]))
+    writer.write_event("gpu-lost", "nvidia-smi query timed out")
+    writer.close()
+
+
+class GameRecoveryTests(unittest.TestCase):
+    """`_game_from_summary` reads the game tag back out of a written log."""
+
+    def test_game_recovered_from_log_event(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            log_file = str(Path(tmp, "capture.jsonl"))
+            _write_log(log_file, "Path of Exile 2")
+            recovered = diagnostic._game_from_summary(summarize(log_file))
+            self.assertEqual(recovered, "Path of Exile 2")
+
+    def test_no_game_event_returns_none(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            log_file = str(Path(tmp, "capture.jsonl"))
+            writer = CrashLogWriter(log_file)
+            writer.write_event("session-start")
+            writer.close()
+            recovered = diagnostic._game_from_summary(summarize(log_file))
+            self.assertIsNone(recovered)
+
+
+class FinishTests(unittest.TestCase):
+    """`finish()` returns the capture summary together with the health findings."""
+
+    def test_finish_combines_summary_and_findings(self):
+        canned = [Finding("warning", "GPU", "NVIDIA Xid 79 ×1", "fell off the bus")]
+        with tempfile.TemporaryDirectory() as tmp:
+            log_file = Path(tmp) / "capture.jsonl"
+            _write_log(str(log_file), "Satisfactory")
+            # Keep the test hermetic: no real recorder to stop, no real health scan.
+            health = mock.patch("rigdoctor.core.health.run_health_checks", return_value=canned)
+            stop = mock.patch.object(diagnostic.reccontrol, "stop_background", return_value=False)
+            pid = mock.patch.object(diagnostic.reccontrol, "running_pid", return_value=None)
+            with health, stop, pid:
+                outcome = diagnostic.finish(log_path=log_file)
+            self.assertEqual(outcome.game, "Satisfactory")
+            self.assertEqual(outcome.summary.samples, 3)
+            self.assertEqual(outcome.findings, canned)
+            # peak GPU temp captured in the window, GPU-lost event recorded
+            self.assertEqual(outcome.summary.maxima["gpu.temp"][0], 81.0)
+            kinds = [kind for _ts, kind, _detail in outcome.summary.events]
+            self.assertIn("gpu-lost", kinds)
+
+
+if __name__ == "__main__":
+    unittest.main()