feat: guided diagnostic session (CLI) — pick a game, capture, analyze — 0.11.0

The seed use case end to end, orchestrating M3 + M4 (ARCHITECTURE §7.1).

- core/diagnostic.py: start(game) runs a focused, game-tagged capture into a
  dedicated diagnostic log (window-scoped report, separate from the always-on
  crash log); finish() stops it and combines the capture summary (M3) with the
  health findings (M4). Game recorded as a log event so it survives crash+reboot.
- CLI: rigdoctor diagnose start --game/--appid | status | finish.
- recorder/record run gained an optional --game tag; reccontrol passes it through.
- Tests for game recovery + the finish() combination.

GUI/tray "Run Diagnostic" button and auto start/stop (D12 wrapper) come next.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-22 08:27:53 +02:00
parent 8b1083a29b
commit 5682878f22
10 changed files with 263 additions and 5 deletions
+12
View File
@@ -5,6 +5,18 @@ All notable changes to RigDoctor are recorded here. Format follows
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
release tag (so the auto-updater, D18, can compare versions).
## [0.11.0] - 2026-05-22
### Added
- **Guided diagnostic session (CLI) — the seed use case, end to end.** `rigdoctor diagnose
start --game "<name>"` runs a **focused crash-capture tagged with that game** (its own
diagnostic log, so the report is scoped to just that session), `diagnose status` shows
progress, and `diagnose finish` stops it and prints a combined report: the **capture
summary** (peak temps/power, GPU-lost events, last samples — M3) plus the **health findings**
(Xid/SMART/driver/etc. — M4). The game can be given by `--game` or `--appid` (resolved from
the Steam scan), and is recorded as a log event so it survives a crash + reboot.
- Shared orchestration lives in `core/diagnostic.py` (one callable for CLI/GUI/tray, per
ARCHITECTURE §7.1); the recorder/`record run` gained an optional `--game` tag.
## [0.10.2] - 2026-05-22
### Changed
- When an Environment **Apply**/**Install** fails, the status now shows the **real reason**
+5 -2
View File
@@ -40,8 +40,11 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`).
- [ ] M10 desktop GUI (PySide6: dashboard, log browser, report viewer, logger controls)
- [ ] M11 tray / menu-bar applet (QSystemTrayIcon: live M1 readouts + Run Diagnostic +
supporting actions — D13)
- [ ] Guided diagnostic session (pick game → focused M3 capture → M4 scan → findings),
shared by tray/GUI/CLI
- [~] Guided diagnostic session (pick game → focused M3 capture → M4 scan → findings),
shared by tray/GUI/CLI — *core + CLI done* (`core/diagnostic.py`, `rigdoctor diagnose
start/status/finish`): tags a focused capture with the chosen game (own diagnostic log,
window-scoped report) and combines the capture summary with the M4 findings. *Pending:*
the GUI/tray "Run Diagnostic" button, and auto start/stop via the D12 wrapper/watcher.
- [ ] Logger trigger modes: always-on + game-launch (D12 — wrapper first:
`rigdoctor wrap %command%` + global Steam compat-tool; zero-config watcher
(Steam RunningAppID + /proc) and GameMode hook follow)
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "rigdoctor"
version = "0.10.2"
version = "0.11.0"
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
readme = "README.md"
requires-python = ">=3.11"
+1 -1
View File
@@ -1,3 +1,3 @@
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
__version__ = "0.10.2"
__version__ = "0.11.0"
+86
View File
@@ -86,6 +86,7 @@ def cmd_record_run(args) -> int:
max_bytes=cfg["log_max_bytes"],
backups=cfg["log_backups"],
status_path=config.STATUS_FILE,
game=getattr(args, "game", None),
)
def _handle(_sig, _frame):
@@ -345,6 +346,77 @@ def cmd_report(args) -> int:
return 0
def _resolve_game(args) -> str | None:
"""Game name from --game, or looked up from --appid via the Steam scan."""
if getattr(args, "game", None):
return args.game
if getattr(args, "appid", None):
from .core import steam
for g in steam.scan_games(steam.selected_library_paths()):
if g.appid == str(args.appid):
return g.name
return None
return None
def cmd_diagnose(args) -> int:
    """Entry point for ``rigdoctor diagnose [start|status|finish]``.

    ``args.diagnose_cmd`` picks the action. When it is empty the command
    behaves like ``status``; any value other than ``start``/``status`` is
    handled as ``finish``.

    Returns the process exit code: 0 on success (``status`` always returns 0),
    1 when a capture could not be started or there is nothing to analyze.
    """
    from .core import diagnostic, reccontrol, steam

    def _start() -> int:
        # Refuse to start while another capture is running.
        if reccontrol.running_pid():
            print("A capture is already running — finish it with: rigdoctor diagnose finish")
            return 1

        game = _resolve_game(args)
        if game is None:
            if args.game or args.appid:
                # The user named something, but it could not be resolved.
                print("Couldn't match that game in your selected Steam libraries.")
                return 1
            # Nothing was named: list the known games so the user can choose one.
            candidates = steam.cached_games() or steam.scan_games(steam.selected_library_paths())
            if not candidates:
                print("No games detected. Select a library: rigdoctor games libraries --all")
                return 1
            print("Pick a game to focus on, then re-run with --game:")
            for entry in candidates:
                print(f" --game {entry.name!r}")
            return 1

        pid = diagnostic.start(game=game, interval=args.interval)
        # Brief pause before checking that the spawned recorder is still alive.
        time.sleep(1.0)
        if not (pid and reccontrol.pid_alive(pid)):
            print(f"Capture failed to start; see {config.SPAWN_LOG}")
            return 1
        print(f"Diagnostic capture started for {game!r} (pid {pid}).")
        print(" Play your game. When you're done (or after a crash + reboot):")
        print(" rigdoctor diagnose finish")
        return 0

    def _status() -> int:
        state = diagnostic.active()
        if state:
            label = state.get("game") or ""
            headline = f"Capturing for {label!r}: {state.get('samples', 0)} samples"
            gpu_note = " · GPU-lost seen" if state.get("gpu_lost") else ""
            print(headline + gpu_note)
        else:
            print("No diagnostic capture is running.")
        return 0

    def _finish() -> int:
        # Proceed if a capture is running, or a diagnostic log is already on disk.
        if not (reccontrol.running_pid() or config.DIAG_LOG.exists()):
            print("No diagnostic to analyze. Start one with: rigdoctor diagnose start --game <name>")
            return 1
        print("Stopping capture and analyzing…\n")
        outcome = diagnostic.finish(last_n=args.last)
        from .render import render_health, render_summary

        if outcome.game:
            print(f"Diagnostic — {outcome.game}\n")
        print(render_summary(outcome.summary, log_path=config.DIAG_LOG))
        print("\n" + render_health(outcome.findings, title="Findings"))
        return 0

    handlers = {"start": _start, "status": _status}
    chosen = args.diagnose_cmd or "status"
    return handlers.get(chosen, _finish)()
def cmd_gameenv(args) -> int:
from dataclasses import asdict
@@ -470,6 +542,7 @@ def build_parser() -> argparse.ArgumentParser:
run_p = rec_sub.add_parser("run", help="run the capture loop in the foreground (systemd-friendly)")
run_p.add_argument("-n", "--interval", type=float, default=None, help="sampling interval (s)")
run_p.add_argument("-o", "--out", default=None, help="log file path")
run_p.add_argument("--game", default=None, help="tag the capture with a game name (M6/diagnose)")
run_p.set_defaults(func=cmd_record_run)
start_p = rec_sub.add_parser("start", help="start recording in the background")
@@ -519,6 +592,19 @@ def build_parser() -> argparse.ArgumentParser:
env_p = sub.add_parser("gameenv", help="gaming environment checks (M6): flag stability/perf settings")
env_p.add_argument("--json", action="store_true", help="output JSON instead of text")
env_p.set_defaults(func=cmd_gameenv)
diag_p = sub.add_parser("diagnose", help="guided diagnostic: capture while gaming, then analyze")
diag_sub = diag_p.add_subparsers(dest="diagnose_cmd")
diag_start = diag_sub.add_parser("start", help="start a focused capture for a game")
diag_start.add_argument("--game", default=None, help="game name to focus on")
diag_start.add_argument("--appid", default=None, help="Steam appid to focus on (resolved to a name)")
diag_start.add_argument("-n", "--interval", type=float, default=None, help="sampling interval (s)")
diag_start.set_defaults(func=cmd_diagnose)
diag_sub.add_parser("status", help="show the in-progress diagnostic").set_defaults(func=cmd_diagnose)
diag_finish = diag_sub.add_parser("finish", help="stop the capture and analyze it")
diag_finish.add_argument("--last", type=int, default=10, help="recent samples to show")
diag_finish.set_defaults(func=cmd_diagnose)
diag_p.set_defaults(func=cmd_diagnose, diagnose_cmd=None, last=10)
return p
+3
View File
@@ -23,6 +23,9 @@ CONFIG_FILE = CONFIG_DIR / "config.toml"
# Crash-capture logger (M3)
LOG_FILE = LOG_DIR / "capture.jsonl"
# Guided diagnostic (M6/D12): a focused capture writes here, separate from the always-on
# crash log, so its report covers only that session's window.
DIAG_LOG = LOG_DIR / "diagnostic.jsonl"
STATUS_FILE = STATE_DIR / "recorder.json"
PID_FILE = STATE_DIR / "recorder.pid"
SPAWN_LOG = STATE_DIR / "recorder.out"
+84
View File
@@ -0,0 +1,84 @@
"""Guided diagnostic session (SPEC §4 / ARCHITECTURE §7.1): orchestrate M3 + M4.
The seed use case, one flow: **pick a game** → **focused crash-capture** scoped to that
session (M3, tagged with the game) → on **finish**, **scan & analyze** (M4 health report)
over the captured window + system logs → return a prioritized result. This is not a new
module — it's a single shared callable so the CLI, GUI, and tray run the identical flow.
The capture is **manually bracketed** (start/finish) for now; auto start/stop on game launch
(the D12 wrapper/watcher) plugs in here later without changing the result shape.
"""
from __future__ import annotations
import time
from dataclasses import dataclass
from .. import config
from . import reccontrol
from .crashlog import Summary, summarize
from .health import Finding
@dataclass
class DiagnosticResult:
    """What a finished diagnostic session hands back to its caller."""

    # Game the capture was tagged with, or None when no tag could be found.
    game: str | None
    # M3 capture summary for the window: peak temps/power, events, last samples.
    summary: Summary
    # M4 health findings: Xid/SMART/driver/etc.
    findings: list[Finding]
def _clear_diag_log() -> None:
    """Remove the previous diagnostic log so the next capture starts clean.

    Deletes ``config.DIAG_LOG`` plus every sibling named ``<log name>.<suffix>``
    (the previous session's extra segments). Deletion is best-effort: a file
    that is missing or cannot be removed is skipped silently.
    """
    main_log = config.DIAG_LOG
    stale = [main_log]
    stale.extend(main_log.parent.glob(f"{main_log.name}.*"))
    for path in stale:
        try:
            path.unlink()
        except OSError:
            # Best-effort cleanup: move on to the next file.
            continue
def start(game: str | None = None, interval: float | None = None) -> int | None:
    """Launch a background capture that writes to the dedicated diagnostic log.

    The previous diagnostic log is cleared first. ``game`` and ``interval`` are
    forwarded unchanged to ``reccontrol.start_background``.

    Returns whatever ``start_background`` returns (the recorder pid), or
    ``None`` without touching anything when a capture is already running.
    """
    already_running = bool(reccontrol.running_pid())
    if not already_running:
        _clear_diag_log()
        target = str(config.DIAG_LOG)
        return reccontrol.start_background(interval=interval, out=target, game=game)
    return None
def is_running() -> bool:
    """Return True when ``reccontrol.running_pid()`` reports a pid (is not None)."""
    pid = reccontrol.running_pid()
    return pid is not None
def active() -> dict | None:
    """Return the recorder's status dict while a capture is running, else None.

    The dict is whatever ``reccontrol.read_status()`` yields, which may itself
    be ``None``.
    """
    return reccontrol.read_status() if is_running() else None
def _await_stopped(timeout: float = 6.0) -> None:
    """Block until no capture is running, or until *timeout* seconds have passed.

    Polls ``reccontrol.running_pid()`` every 0.1 s. Returns silently in both
    cases; the caller is not told whether the recorder actually stopped.
    """
    give_up_at = time.monotonic() + timeout
    while True:
        if not reccontrol.running_pid():
            return
        if not (time.monotonic() < give_up_at):
            return
        time.sleep(0.1)
def _game_from_summary(summary: Summary) -> str | None:
"""Recover the focused game from the log's 'game' event (survives a crash + reboot)."""
for _ts, kind, detail in reversed(summary.events):
if kind == "game" and detail:
return detail
return None
def finish(last_n: int = 10, log_path=None) -> DiagnosticResult:
    """End the session and build the combined report.

    Steps, in order: ask the recorder to stop, wait for it to stop, summarize
    the capture log, then run the health checks.

    Args:
        last_n: forwarded to ``summarize`` as ``last_n``.
        log_path: log to summarize; falls back to ``config.DIAG_LOG`` when
            not given.

    Returns:
        A ``DiagnosticResult`` holding the game (taken from the log's ``game``
        event, else from the recorder status, else ``None``), the capture
        summary, and the health findings.
    """
    # Looked up at call time, so whatever health.run_health_checks is bound to
    # when finish() runs is what gets called.
    from .health import run_health_checks

    reccontrol.stop_background()
    _await_stopped()

    source = log_path if log_path else config.DIAG_LOG
    window = summarize(source, last_n=last_n)

    focused = _game_from_summary(window)
    if not focused:
        # No usable tag in the log: fall back to the recorder's status.
        status = reccontrol.read_status()
        focused = status.get("game") if status else None

    report = run_health_checks()
    return DiagnosticResult(game=focused, summary=window, findings=report)
+5 -1
View File
@@ -38,7 +38,9 @@ def read_status() -> dict | None:
return None
def start_background(interval: float | None = None, out: str | None = None) -> int | None:
def start_background(
interval: float | None = None, out: str | None = None, game: str | None = None
) -> int | None:
"""Spawn a detached `record run`. Returns the child pid, or None if already running."""
if running_pid():
return None
@@ -48,6 +50,8 @@ def start_background(interval: float | None = None, out: str | None = None) -> i
cmd += ["--interval", str(interval)]
if out:
cmd += ["--out", out]
if game:
cmd += ["--game", game]
out_fh = open(config.SPAWN_LOG, "a")
proc = subprocess.Popen(
cmd,
+5
View File
@@ -27,12 +27,14 @@ class Recorder:
backups: int = 10,
status_path=None,
sampler: Sampler | None = None,
game: str | None = None,
) -> None:
self.interval = interval
self.sampler = sampler or Sampler(available_sources())
self.writer = CrashLogWriter(log_path, max_bytes, backups)
self.log_path = Path(log_path)
self.status_path = Path(status_path) if status_path else None
self.game = game or None
self.samples = 0
self._stop = threading.Event()
self._gpu_lost = False
@@ -43,6 +45,8 @@ class Recorder:
def run(self) -> None:
self.writer.write_event("session-start", f"interval={self.interval:g}s")
if self.game:
self.writer.write_event("game", self.game) # tag the focused-diagnostic target
self._write_status(running=True)
try:
while not self._stop.is_set():
@@ -81,6 +85,7 @@ class Recorder:
"samples": self.samples,
"updated": time.time(),
"gpu_lost": self._gpu_lost,
"game": self.game,
}
if sample is not None:
data["latest"] = headline(sample)
+61
View File
@@ -0,0 +1,61 @@
"""Tests for the guided diagnostic orchestration (M3+M4 glue)."""
import tempfile
import time
import unittest
from pathlib import Path
from unittest import mock
from rigdoctor.core import diagnostic
from rigdoctor.core.crashlog import CrashLogWriter, summarize
from rigdoctor.core.health import Finding
from rigdoctor.core.sample import Reading, Sample
def _write_log(path: str, game: str) -> None:
    """Write a small capture log to *path* for the tests to read back.

    Contents, in order: a session-start event, a ``game`` event carrying
    *game*, three GPU temperature samples (60, 72 and 81 °C), and a gpu-lost
    event. The writer is closed before returning.
    """
    writer = CrashLogWriter(path)
    writer.write_event("session-start", "interval=1s")
    writer.write_event("game", game)
    for celsius in (60.0, 72.0, 81.0):
        gpu_temp = Reading("gpu", "temp", celsius, "°C", "")
        writer.write_sample(Sample(ts=time.time(), readings=[gpu_temp]))
    writer.write_event("gpu-lost", "nvidia-smi query timed out")
    writer.close()
class GameRecoveryTests(unittest.TestCase):
    """The focused game must be recoverable from the capture log alone."""

    def test_game_recovered_from_log_event(self):
        # A log containing a 'game' event yields that game's name.
        with tempfile.TemporaryDirectory() as workdir:
            capture = str(Path(workdir) / "capture.jsonl")
            _write_log(capture, "Path of Exile 2")
            recovered = diagnostic._game_from_summary(summarize(capture))
            self.assertEqual(recovered, "Path of Exile 2")

    def test_no_game_event_returns_none(self):
        # A log with only a session-start event carries no game tag.
        with tempfile.TemporaryDirectory() as workdir:
            capture = str(Path(workdir) / "capture.jsonl")
            writer = CrashLogWriter(capture)
            writer.write_event("session-start")
            writer.close()
            recovered = diagnostic._game_from_summary(summarize(capture))
            self.assertIsNone(recovered)
class FinishTests(unittest.TestCase):
    """finish() must merge the capture summary with the health findings."""

    def test_finish_combines_summary_and_findings(self):
        with tempfile.TemporaryDirectory() as workdir:
            capture = Path(workdir) / "capture.jsonl"
            _write_log(str(capture), "Satisfactory")
            canned = [Finding("warning", "GPU", "NVIDIA Xid 79 ×1", "fell off the bus")]

            # Stub the health scan and the recorder control calls.
            health_patch = mock.patch(
                "rigdoctor.core.health.run_health_checks", return_value=canned
            )
            stop_patch = mock.patch.object(
                diagnostic.reccontrol, "stop_background", return_value=False
            )
            pid_patch = mock.patch.object(
                diagnostic.reccontrol, "running_pid", return_value=None
            )
            with health_patch, stop_patch, pid_patch:
                outcome = diagnostic.finish(log_path=capture)

            self.assertEqual(outcome.game, "Satisfactory")
            self.assertEqual(outcome.summary.samples, 3)
            self.assertEqual(outcome.findings, canned)
            # Peak GPU temperature within the window, and the GPU-lost event, are kept.
            peak_gpu_temp = outcome.summary.maxima["gpu.temp"][0]
            self.assertEqual(peak_gpu_temp, 81.0)
            event_kinds = [kind for _ts, kind, _detail in outcome.summary.events]
            self.assertTrue("gpu-lost" in event_kinds)
if __name__ == "__main__":
unittest.main()