feat: guided diagnostic session (CLI) — pick a game, capture, analyze — 0.11.0
The seed use case end to end, orchestrating M3 + M4 (ARCHITECTURE §7.1).

- core/diagnostic.py: start(game) runs a focused, game-tagged capture into a dedicated diagnostic log (window-scoped report, separate from the always-on crash log); finish() stops it and combines the capture summary (M3) with the health findings (M4). The game is recorded as a log event so it survives a crash + reboot.
- CLI: rigdoctor diagnose start --game/--appid | status | finish.
- recorder / `record run` gained an optional --game tag; reccontrol passes it through.
- Tests for game recovery + the finish() combination.

The GUI/tray "Run Diagnostic" button and auto start/stop (D12 wrapper) come next.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,18 @@ All notable changes to RigDoctor are recorded here. Format follows
|
||||
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
|
||||
release tag (so the auto-updater, D18, can compare versions).
|
||||
|
||||
## [0.11.0] - 2026-05-22
|
||||
### Added
|
||||
- **Guided diagnostic session (CLI) — the seed use case, end to end.** `rigdoctor diagnose
|
||||
start --game "<name>"` runs a **focused crash-capture tagged with that game** (its own
|
||||
diagnostic log, so the report is scoped to just that session), `diagnose status` shows
|
||||
progress, and `diagnose finish` stops it and prints a combined report: the **capture
|
||||
summary** (peak temps/power, GPU-lost events, last samples — M3) plus the **health findings**
|
||||
(Xid/SMART/driver/etc. — M4). The game can be given by `--game` or `--appid` (resolved from
|
||||
the Steam scan), and is recorded as a log event so it survives a crash + reboot.
|
||||
- Shared orchestration lives in `core/diagnostic.py` (one callable for CLI/GUI/tray, per
|
||||
ARCHITECTURE §7.1); the recorder/`record run` gained an optional `--game` tag.
|
||||
|
||||
## [0.10.2] - 2026-05-22
|
||||
### Changed
|
||||
- When an Environment **Apply**/**Install** fails, the status now shows the **real reason**
|
||||
|
||||
+5
-2
@@ -40,8 +40,11 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`).
|
||||
- [ ] M10 desktop GUI (PySide6: dashboard, log browser, report viewer, logger controls)
|
||||
- [ ] M11 tray / menu-bar applet (QSystemTrayIcon: live M1 readouts + Run Diagnostic +
|
||||
supporting actions — D13)
|
||||
- [ ] Guided diagnostic session (pick game → focused M3 capture → M4 scan → findings),
|
||||
shared by tray/GUI/CLI
|
||||
- [~] Guided diagnostic session (pick game → focused M3 capture → M4 scan → findings),
|
||||
shared by tray/GUI/CLI — *core + CLI done* (`core/diagnostic.py`, `rigdoctor diagnose
|
||||
start/status/finish`): tags a focused capture with the chosen game (own diagnostic log,
|
||||
window-scoped report) and combines the capture summary with the M4 findings. *Pending:*
|
||||
the GUI/tray "Run Diagnostic" button, and auto start/stop via the D12 wrapper/watcher.
|
||||
- [ ] Logger trigger modes: always-on + game-launch (D12 — wrapper first:
|
||||
`rigdoctor wrap %command%` + global Steam compat-tool; zero-config watcher
|
||||
(Steam RunningAppID + /proc) and GameMode hook follow)
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "rigdoctor"
|
||||
version = "0.10.2"
|
||||
version = "0.11.0"
|
||||
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
|
||||
|
||||
__version__ = "0.10.2"
|
||||
__version__ = "0.11.0"
|
||||
|
||||
@@ -86,6 +86,7 @@ def cmd_record_run(args) -> int:
|
||||
max_bytes=cfg["log_max_bytes"],
|
||||
backups=cfg["log_backups"],
|
||||
status_path=config.STATUS_FILE,
|
||||
game=getattr(args, "game", None),
|
||||
)
|
||||
|
||||
def _handle(_sig, _frame):
|
||||
@@ -345,6 +346,77 @@ def cmd_report(args) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _resolve_game(args) -> str | None:
|
||||
"""Game name from --game, or looked up from --appid via the Steam scan."""
|
||||
if getattr(args, "game", None):
|
||||
return args.game
|
||||
if getattr(args, "appid", None):
|
||||
from .core import steam
|
||||
|
||||
for g in steam.scan_games(steam.selected_library_paths()):
|
||||
if g.appid == str(args.appid):
|
||||
return g.name
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _diagnose_start(args) -> int:
    """Handle ``diagnose start``: begin a focused, game-tagged capture.

    Returns 0 once the spawned recorder is confirmed alive. Returns 1 when a capture
    is already running, the requested game cannot be matched, no game was given (the
    known games are listed instead), or the recorder is not alive after the short wait.
    """
    from .core import diagnostic, reccontrol, steam

    if reccontrol.running_pid():
        print("A capture is already running — finish it with: rigdoctor diagnose finish")
        return 1

    game = _resolve_game(args)
    if game is None:
        if args.game or args.appid:
            print("Couldn't match that game in your selected Steam libraries.")
            return 1
        # Nothing was requested: list the known games so the user can re-run with one.
        games = steam.cached_games() or steam.scan_games(steam.selected_library_paths())
        if games:
            print("Pick a game to focus on, then re-run with --game:")
            for g in games:
                print(f" --game {g.name!r}")
        else:
            print("No games detected. Select a library: rigdoctor games libraries --all")
        return 1

    pid = diagnostic.start(game=game, interval=args.interval)
    # Wait briefly before checking liveness so an immediate exit is reported here.
    time.sleep(1.0)
    if pid and reccontrol.pid_alive(pid):
        print(f"Diagnostic capture started for {game!r} (pid {pid}).")
        print(" Play your game. When you're done (or after a crash + reboot):")
        print(" rigdoctor diagnose finish")
        return 0
    print(f"Capture failed to start; see {config.SPAWN_LOG}")
    return 1


def _diagnose_status() -> int:
    """Handle ``diagnose status``: print the in-progress capture, if any. Always returns 0."""
    from .core import diagnostic

    status = diagnostic.active()
    if not status:
        print("No diagnostic capture is running.")
        return 0
    game = status.get("game") or "—"
    gpu_note = " · GPU-lost seen" if status.get("gpu_lost") else ""
    print(f"Capturing for {game!r}: {status.get('samples', 0)} samples" + gpu_note)
    return 0


def _diagnose_finish(args) -> int:
    """Handle ``diagnose finish``: stop the capture and print the combined report.

    Returns 1 when no capture is running and no diagnostic log exists, otherwise 0.
    """
    from .core import diagnostic, reccontrol

    if not reccontrol.running_pid() and not config.DIAG_LOG.exists():
        print("No diagnostic to analyze. Start one with: rigdoctor diagnose start --game <name>")
        return 1

    print("Stopping capture and analyzing…\n")
    result = diagnostic.finish(last_n=args.last)

    from .render import render_health, render_summary

    if result.game:
        print(f"Diagnostic — {result.game}\n")
    print(render_summary(result.summary, log_path=config.DIAG_LOG))
    print("\n" + render_health(result.findings, title="Findings"))
    return 0


def cmd_diagnose(args) -> int:
    """Entry point for ``rigdoctor diagnose``; dispatch on the chosen sub-command.

    A missing sub-command is treated as ``status``. Anything other than ``start`` or
    ``status`` is handled as ``finish``.
    """
    sub = args.diagnose_cmd or "status"
    if sub == "start":
        return _diagnose_start(args)
    if sub == "status":
        return _diagnose_status()
    return _diagnose_finish(args)
|
||||
|
||||
|
||||
def cmd_gameenv(args) -> int:
|
||||
from dataclasses import asdict
|
||||
|
||||
@@ -470,6 +542,7 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
run_p = rec_sub.add_parser("run", help="run the capture loop in the foreground (systemd-friendly)")
|
||||
run_p.add_argument("-n", "--interval", type=float, default=None, help="sampling interval (s)")
|
||||
run_p.add_argument("-o", "--out", default=None, help="log file path")
|
||||
run_p.add_argument("--game", default=None, help="tag the capture with a game name (M6/diagnose)")
|
||||
run_p.set_defaults(func=cmd_record_run)
|
||||
|
||||
start_p = rec_sub.add_parser("start", help="start recording in the background")
|
||||
@@ -519,6 +592,19 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
env_p = sub.add_parser("gameenv", help="gaming environment checks (M6): flag stability/perf settings")
|
||||
env_p.add_argument("--json", action="store_true", help="output JSON instead of text")
|
||||
env_p.set_defaults(func=cmd_gameenv)
|
||||
|
||||
diag_p = sub.add_parser("diagnose", help="guided diagnostic: capture while gaming, then analyze")
|
||||
diag_sub = diag_p.add_subparsers(dest="diagnose_cmd")
|
||||
diag_start = diag_sub.add_parser("start", help="start a focused capture for a game")
|
||||
diag_start.add_argument("--game", default=None, help="game name to focus on")
|
||||
diag_start.add_argument("--appid", default=None, help="Steam appid to focus on (resolved to a name)")
|
||||
diag_start.add_argument("-n", "--interval", type=float, default=None, help="sampling interval (s)")
|
||||
diag_start.set_defaults(func=cmd_diagnose)
|
||||
diag_sub.add_parser("status", help="show the in-progress diagnostic").set_defaults(func=cmd_diagnose)
|
||||
diag_finish = diag_sub.add_parser("finish", help="stop the capture and analyze it")
|
||||
diag_finish.add_argument("--last", type=int, default=10, help="recent samples to show")
|
||||
diag_finish.set_defaults(func=cmd_diagnose)
|
||||
diag_p.set_defaults(func=cmd_diagnose, diagnose_cmd=None, last=10)
|
||||
return p
|
||||
|
||||
|
||||
|
||||
@@ -23,6 +23,9 @@ CONFIG_FILE = CONFIG_DIR / "config.toml"
|
||||
|
||||
# Crash-capture logger (M3)
|
||||
LOG_FILE = LOG_DIR / "capture.jsonl"
|
||||
# Guided diagnostic (M6/D12): a focused capture writes here, separate from the always-on
|
||||
# crash log, so its report covers only that session's window.
|
||||
DIAG_LOG = LOG_DIR / "diagnostic.jsonl"
|
||||
STATUS_FILE = STATE_DIR / "recorder.json"
|
||||
PID_FILE = STATE_DIR / "recorder.pid"
|
||||
SPAWN_LOG = STATE_DIR / "recorder.out"
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Guided diagnostic session (SPEC §4 / ARCHITECTURE §7.1): orchestrate M3 + M4.
|
||||
|
||||
The seed use case, one flow: **pick a game** → **focused crash-capture** scoped to that
|
||||
session (M3, tagged with the game) → on **finish**, **scan & analyze** (M4 health report)
|
||||
over the captured window + system logs → return a prioritized result. This is not a new
|
||||
module — it's a single shared callable so the CLI, GUI, and tray run the identical flow.
|
||||
|
||||
The capture is **manually bracketed** (start/finish) for now; auto start/stop on game launch
|
||||
(the D12 wrapper/watcher) plugs in here later without changing the result shape.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .. import config
|
||||
from . import reccontrol
|
||||
from .crashlog import Summary, summarize
|
||||
from .health import Finding
|
||||
|
||||
|
||||
@dataclass
class DiagnosticResult:
    """Combined outcome of a guided diagnostic session, as built by ``finish()``."""

    # Focused game, or None when none could be recovered.
    game: str | None
    # Capture window: peak temps/power, events, last samples (M3).
    summary: Summary
    # Health findings: Xid/SMART/driver/etc. (M4).
    findings: list[Finding]
|
||||
|
||||
|
||||
def _clear_diag_log() -> None:
    """Delete the diagnostic log and any sibling files named ``<log name>.*``.

    Every diagnostic is a fresh focused capture, so leftovers from a previous session
    are dropped first. Removal is best effort: an ``OSError`` on any file is ignored.
    """
    log = config.DIAG_LOG
    stale = [log]
    stale.extend(log.parent.glob(f"{log.name}.*"))
    for path in stale:
        try:
            path.unlink()
        except OSError:
            # Deliberately ignored: the file may not exist or may not be removable.
            continue
|
||||
|
||||
|
||||
def start(game: str | None = None, interval: float | None = None) -> int | None:
    """Start a focused capture, tagged with ``game``, writing to the diagnostic log.

    Any previous diagnostic log is cleared first. Returns the pid reported by
    ``reccontrol.start_background``, or None if a capture is already running.
    """
    already_running = reccontrol.running_pid()
    if already_running:
        return None

    _clear_diag_log()
    diag_log = str(config.DIAG_LOG)
    return reccontrol.start_background(interval=interval, out=diag_log, game=game)
|
||||
|
||||
|
||||
def is_running() -> bool:
    """Return True when ``reccontrol`` reports a running recorder pid."""
    pid = reccontrol.running_pid()
    return pid is not None
|
||||
|
||||
|
||||
def active() -> dict | None:
    """Return the recorder's status dict while a capture is running, or None if idle."""
    return reccontrol.read_status() if is_running() else None
|
||||
|
||||
|
||||
def _await_stopped(timeout: float = 6.0) -> None:
    """Poll every 0.1 s until no recorder pid is reported or ``timeout`` seconds pass.

    Returns silently in both cases; callers are not told whether the wait timed out.
    """
    give_up_at = time.monotonic() + timeout
    while True:
        if not reccontrol.running_pid():
            return
        if time.monotonic() >= give_up_at:
            return
        time.sleep(0.1)
|
||||
|
||||
|
||||
def _game_from_summary(summary: Summary) -> str | None:
    """Return the detail of the most recent non-empty ``game`` event, or None.

    The game is read from the log's events rather than from live state, so it can
    still be recovered after a crash + reboot.
    """
    newest_first = reversed(summary.events)
    return next(
        (detail for _ts, kind, detail in newest_first if kind == "game" and detail),
        None,
    )
|
||||
|
||||
|
||||
def finish(last_n: int = 10, log_path=None) -> DiagnosticResult:
    """Stop the capture (if running), summarize its log, and run the health checks.

    ``log_path`` overrides the default diagnostic log. The game is taken from the
    log's ``game`` event, falling back to the recorder status file's ``game`` entry.
    """
    # Imported at call time, so a replacement installed on the health module is used.
    from .health import run_health_checks

    reccontrol.stop_background()
    _await_stopped()

    window = summarize(log_path or config.DIAG_LOG, last_n=last_n)

    focused = _game_from_summary(window)
    if not focused:
        status = reccontrol.read_status() or {}
        focused = status.get("game")

    return DiagnosticResult(game=focused, summary=window, findings=run_health_checks())
|
||||
@@ -38,7 +38,9 @@ def read_status() -> dict | None:
|
||||
return None
|
||||
|
||||
|
||||
def start_background(interval: float | None = None, out: str | None = None) -> int | None:
|
||||
def start_background(
|
||||
interval: float | None = None, out: str | None = None, game: str | None = None
|
||||
) -> int | None:
|
||||
"""Spawn a detached `record run`. Returns the child pid, or None if already running."""
|
||||
if running_pid():
|
||||
return None
|
||||
@@ -48,6 +50,8 @@ def start_background(interval: float | None = None, out: str | None = None) -> i
|
||||
cmd += ["--interval", str(interval)]
|
||||
if out:
|
||||
cmd += ["--out", out]
|
||||
if game:
|
||||
cmd += ["--game", game]
|
||||
out_fh = open(config.SPAWN_LOG, "a")
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
|
||||
@@ -27,12 +27,14 @@ class Recorder:
|
||||
backups: int = 10,
|
||||
status_path=None,
|
||||
sampler: Sampler | None = None,
|
||||
game: str | None = None,
|
||||
) -> None:
|
||||
self.interval = interval
|
||||
self.sampler = sampler or Sampler(available_sources())
|
||||
self.writer = CrashLogWriter(log_path, max_bytes, backups)
|
||||
self.log_path = Path(log_path)
|
||||
self.status_path = Path(status_path) if status_path else None
|
||||
self.game = game or None
|
||||
self.samples = 0
|
||||
self._stop = threading.Event()
|
||||
self._gpu_lost = False
|
||||
@@ -43,6 +45,8 @@ class Recorder:
|
||||
|
||||
def run(self) -> None:
|
||||
self.writer.write_event("session-start", f"interval={self.interval:g}s")
|
||||
if self.game:
|
||||
self.writer.write_event("game", self.game) # tag the focused-diagnostic target
|
||||
self._write_status(running=True)
|
||||
try:
|
||||
while not self._stop.is_set():
|
||||
@@ -81,6 +85,7 @@ class Recorder:
|
||||
"samples": self.samples,
|
||||
"updated": time.time(),
|
||||
"gpu_lost": self._gpu_lost,
|
||||
"game": self.game,
|
||||
}
|
||||
if sample is not None:
|
||||
data["latest"] = headline(sample)
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
"""Tests for the guided diagnostic orchestration (M3+M4 glue)."""
|
||||
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from rigdoctor.core import diagnostic
|
||||
from rigdoctor.core.crashlog import CrashLogWriter, summarize
|
||||
from rigdoctor.core.health import Finding
|
||||
from rigdoctor.core.sample import Reading, Sample
|
||||
|
||||
|
||||
def _write_log(path: str, game: str) -> None:
    """Write a small capture log to ``path``.

    Contents, in order: a session-start event, a ``game`` event carrying ``game``,
    three GPU temperature samples (60, 72, 81 °C), and a gpu-lost event.
    """
    writer = CrashLogWriter(path)
    writer.write_event("session-start", "interval=1s")
    writer.write_event("game", game)
    for celsius in (60.0, 72.0, 81.0):
        reading = Reading("gpu", "temp", celsius, "°C", "")
        writer.write_sample(Sample(ts=time.time(), readings=[reading]))
    writer.write_event("gpu-lost", "nvidia-smi query timed out")
    writer.close()
|
||||
|
||||
|
||||
class GameRecoveryTests(unittest.TestCase):
    """``_game_from_summary`` reads the focused game back out of a written log."""

    def test_game_recovered_from_log_event(self):
        with tempfile.TemporaryDirectory() as tmp:
            log_file = str(Path(tmp) / "capture.jsonl")
            _write_log(log_file, "Path of Exile 2")
            recovered = diagnostic._game_from_summary(summarize(log_file))
            self.assertEqual(recovered, "Path of Exile 2")

    def test_no_game_event_returns_none(self):
        with tempfile.TemporaryDirectory() as tmp:
            log_file = str(Path(tmp) / "capture.jsonl")
            # A log holding only a session-start event carries no game tag.
            writer = CrashLogWriter(log_file)
            writer.write_event("session-start")
            writer.close()
            recovered = diagnostic._game_from_summary(summarize(log_file))
            self.assertIsNone(recovered)
|
||||
|
||||
|
||||
class FinishTests(unittest.TestCase):
    """``finish()`` merges the capture summary (M3) with the health findings (M4)."""

    def test_finish_combines_summary_and_findings(self):
        with tempfile.TemporaryDirectory() as tmp:
            log_file = Path(tmp) / "capture.jsonl"
            _write_log(str(log_file), "Satisfactory")
            canned = [Finding("warning", "GPU", "NVIDIA Xid 79 ×1", "fell off the bus")]

            # Stub the health scan and the recorder control so no real process is touched.
            with (
                mock.patch("rigdoctor.core.health.run_health_checks", return_value=canned),
                mock.patch.object(diagnostic.reccontrol, "stop_background", return_value=False),
                mock.patch.object(diagnostic.reccontrol, "running_pid", return_value=None),
            ):
                outcome = diagnostic.finish(log_path=log_file)

            self.assertEqual(outcome.game, "Satisfactory")
            self.assertEqual(outcome.summary.samples, 3)
            self.assertEqual(outcome.findings, canned)
            # Peak GPU temp captured in the window, GPU-lost event recorded.
            self.assertEqual(outcome.summary.maxima["gpu.temp"][0], 81.0)
            event_kinds = [kind for _ts, kind, _detail in outcome.summary.events]
            self.assertTrue(any(kind == "gpu-lost" for kind in event_kinds))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user