diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml new file mode 100644 index 0000000..d3a477d --- /dev/null +++ b/.gitea/workflows/release.yml @@ -0,0 +1,65 @@ +name: release +run-name: Release on push to main + +# Builds a wheel + sdist and publishes a Gitea release vX.Y.Z on every push to +# main. The version comes from pyproject.toml (kept in lockstep with __version__, D19); +# if a release for that tag already exists, the job is a no-op — so bump the version +# (and CHANGELOG) to cut a new release. + +on: + push: + branches: [main] + +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Build wheel + sdist + run: | + python -m pip install --upgrade build + python -m build + + - name: Read version + id: ver + run: | + V=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])") + echo "version=$V" >> "$GITHUB_OUTPUT" + + - name: Publish Gitea release + env: + TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + API="${{ github.server_url }}/api/v1/repos/${{ github.repository }}" + TAG="v${{ steps.ver.outputs.version }}" + + code=$(curl -sS -o /tmp/existing.json -w '%{http_code}' \ + -H "Authorization: token ${TOKEN}" "${API}/releases/tags/${TAG}") + if [ "$code" = "200" ]; then + echo "Release ${TAG} already exists — nothing to do." + exit 0 + fi + + echo "Creating release ${TAG}…" + rid=$(curl -sS -X POST \ + -H "Authorization: token ${TOKEN}" \ + -H "Content-Type: application/json" \ + -d "{\"tag_name\":\"${TAG}\",\"target_commitish\":\"${{ github.sha }}\",\"name\":\"${TAG}\",\"body\":\"Automated release for ${TAG}.
See CHANGELOG.md.\"}" \ + "${API}/releases" | python -c "import sys, json; print(json.load(sys.stdin)['id'])") + + for f in dist/*; do + echo "Uploading $(basename "$f")…" + curl -sS -X POST \ + -H "Authorization: token ${TOKEN}" \ + -F "attachment=@${f}" \ + "${API}/releases/${rid}/assets?name=$(basename "$f")" >/dev/null + done + echo "Published ${TAG}." diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..eb3f06a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,33 @@ +# Changelog + +All notable changes to RigDoctor are recorded here. Format follows +[Keep a Changelog](https://keepachangelog.com/); versioning is SemVer-style +(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git +release tag (so the auto-updater, D18, can compare versions). + +## [0.0.2] - 2026-05-21 +### Added +- **M3 crash-capture logger**: crash-safe JSONL (`fsync` per sample), size-based rotation, + GPU-lost/recovered event markers, atomic status file; `rigdoctor record run|start|stop| + status|report` (foreground `run` is the systemd-ready entrypoint). +- **GUI Recording/Logs page** (M10): start/stop/interval controls, live status, and the + post-crash report — driving the same recorder via shared `core.reccontrol`. +- Shared render helpers (`format_raw`, `format_headline`, `render_summary`) used by CLI + GUI. +- Tests for the crash log (writer, rotation, reader, summary, recorder). +- **Gitea Actions release workflow** (`.gitea/workflows/release.yml`): on push to `main`, + builds wheel + sdist and publishes a Gitea release `vX.Y.Z` with the artifacts. +### Changed +- **GUI-first** emphasis (D17): docs reframed; the CLI keeps full parity for headless/SSH. +- CPU core temperatures ordered (package, then core 0, 4, 8, …) at the source — fixes the + CLI ordering too. +- Distribution revised (D8): **user-local self-updating install** is primary, `.deb` optional.
+### Planned (docs only) +- M12 session sharing / remote assist (D16); M13 no-root auto-update from the public repo + (D18); versioning/changelog convention (D19). + +## [0.0.1] - 2026-05-21 +### Added +- Initial release: planning docs and decisions (D1–D15); **M1 sensor core** (NVIDIA GPU via + nvidia-smi, CPU via hwmon, memory + DDR5 SPD temps, NVMe); CLI (`snapshot`, `monitor`, + `sources`); and the **M10 desktop GUI** — dark dashboard with circular gauges and + collapsible, temperature-colored cards. diff --git a/README.md b/README.md index 4a9f710..b4e3401 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,10 @@ A **modular diagnostics, monitoring, and health-check toolkit for Linux gamers.** -> **Status:** 🟢 Phase 1 (MVP) in progress. Foundational decisions are settled and the -> **sensor core (M1)** works — `snapshot` / `monitor` read NVIDIA GPU, CPU, memory, and -> NVMe live. Crash logger (M3) and health report (M4) are next. See `docs/ROADMAP.md`. +> **Status:** 🟢 Phase 1 (MVP) in progress. The **sensor core (M1)** and **crash-capture +> logger (M3)** work — `snapshot`/`monitor` read NVIDIA GPU, CPU, memory, and NVMe live, and +> `record` captures a crash-safe log with a post-crash report. A desktop GUI (M10) is also +> up. Health report (M4) is next. See `docs/ROADMAP.md`. ## Why this exists @@ -25,13 +26,14 @@ See `docs/SPEC.md` §1. ## How you run it -Three front-ends over one shared engine — pick what fits: -- **CLI / headless** — full functionality from the terminal, works over SSH. -- **Desktop GUI** — graphical dashboard, log browser, and health-report viewer. -- **Tray applet** — a small applet in the top menu bar with quick actions (e.g. start - recording) and at-a-glance status. +RigDoctor is **GUI-first** — the desktop app is the primary way in — but every feature is +also available headless: +- **Desktop GUI** — graphical dashboard, recording controls, log browser, reports. The + default interface for most users. 
+- **Tray applet** — a small top-menu-bar applet with quick actions and at-a-glance status. +- **CLI** — full functionality from the terminal; works over SSH and in scripts. -The GUI and tray are optional modules; a headless install loses no diagnostic capability. +The GUI/tray are optional modules; a headless (CLI-only) install loses no capability. ## Key decisions (settled) @@ -42,7 +44,7 @@ The GUI and tray are optional modules; a headless install loses no diagnostic ca | Primary distro | **Ubuntu** (Debian via apt); others best-effort later | | Primary GPU | **NVIDIA** first; AMD, then Intel later | | MVP | **Sensor core + crash logger + health report** (NVIDIA-only, CLI-first) | -| Distribution | **`.deb`** + interactive module installer | +| Distribution | **User-local install** (self-updating from the public repo, no root); **`.deb`** optional | | Scope of action | **Read-only + suggestions** (no auto-apply yet) | | Stress tests | **Out of scope** | @@ -73,6 +75,23 @@ PYTHONPATH=src python3 -m rigdoctor sources # list detected sensor sources PYTHONPATH=src python3 -m unittest discover -s tests ``` +### Crash-capture logger (M3) + +A crash-safe background logger (JSONL, `fsync` per sample, bounded by rotation) for catching +the state right before a freeze: + +```bash +rigdoctor record start # start logging in the background +rigdoctor record status # is it running? latest readings, sample count +rigdoctor record stop # stop it +rigdoctor record report # post-crash summary: peaks, events, last samples +rigdoctor record run # run in the foreground (the systemd-ready entrypoint) +``` + +Logs live in `~/.local/share/rigdoctor/logs/`. It detects GPU "lost"/hang (nvidia-smi query +timeout) and writes an event marker. Trigger modes (always-on / game-launch) and the +`systemd --user` service arrive in Phase 4. 
+ ### Desktop GUI (M10) The GUI uses PySide6 (Qt) — the only part of RigDoctor that needs a non-stdlib dep: @@ -85,7 +104,8 @@ rigdoctor gui # or: rigdoctor-gui It opens a dark-themed window with sidebar navigation and a **live dashboard** over the same sensor core — circular gauges for the headline metrics plus collapsible per-subsystem cards (GPU/CPU/memory/storage) with temperature-colored values (icey-blue → green → red). -The Logs / Health / Inventory sections are placeholders until M3–M5 land. +The **Logs** section is a full recording page (start/stop, live status, and the post-crash +report); Health / Inventory are placeholders until M4 / M5 land. Without the GUI extra, `pip install -e .` gives just the stdlib-only CLI. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index f453814..c433847 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -129,8 +129,9 @@ as a single callable so all three front-ends share one implementation. optionally **enable** the `systemd --user` logger service and choose its trigger mode (D6). 5. **Verify** each installed module's `probe()` and print a readiness summary. -Module list/bundling is final (D14). Packaging is `.deb`-first (D8); the wizard layers -module selection on top of the package. +Module list/bundling is final (D14). Packaging: a **user-local install is primary** +(self-updating from the public repo, no root — D8/D18), with an **optional `.deb`** system +package; the wizard layers module selection on top of either. ## 9. GPU vendor abstraction | Capability | NVIDIA (first) | AMD (later) | Intel (later) | diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md index 583deeb..494e55d 100644 --- a/docs/DECISIONS.md +++ b/docs/DECISIONS.md @@ -1,8 +1,9 @@ # RigDoctor — Decisions & Open Questions Format: each item is **OPEN** (needs a call) or **DECIDED** (with date + rationale). 
-Decisions D1–D15 were all settled on 2026-05-21; the original open questions are kept below -with their resolutions so the reasoning is traceable. No tracked decisions are currently open. +Decisions D1–D19 are settled (D1–D15 on 2026-05-21); the original open questions are kept +below with their resolutions so the reasoning is traceable. No tracked decisions are +currently open. ## Decided @@ -34,9 +35,10 @@ AMD and Intel come later behind the vendor abstraction; nothing should hard-code way that blocks them. ### D5 — MVP scope — *DECIDED 2026-05-21* -**M1 + M3 + M4 (the *Essential* bundle), NVIDIA-only**, CLI-first. This is the first build -target — it captures the seed crash and explains the logs before any installer, GUI, tray, -or multi-vendor work. +**M1 + M3 + M4 (the *Essential* bundle), NVIDIA-only.** This was the first build target — it +captures the seed crash and explains the logs before any installer, multi-vendor, etc. work. +*(The MVP was built CLI-first; per D17 the GUI is now the primary interface going forward — +the CLI keeps full parity.)* ### D6 — Crash-logger trigger model — *DECIDED 2026-05-21* **Let the user choose.** All three modes are supported and selectable (installer + config): @@ -50,10 +52,13 @@ or multi-vendor work. generators. Users who want to reproduce load can run existing tools (gpu-burn, vkmark, stress-ng) themselves alongside the logger. -### D8 — Distribution / packaging — *DECIDED 2026-05-21* -**`.deb` package** as the primary distribution channel (matches the Ubuntu-first focus). The -`.deb` declares dependencies per module group; the interactive installer (M9) handles module -selection on top. AUR / Flatpak / COPR are possible later, not now. +### D8 — Distribution / packaging — *DECIDED 2026-05-21; revised 2026-05-21 (see D18)* +**Primary: a user-local install** (pipx/venv or a versioned bundle under `~/.local`, owned by +the user) so the app can **self-update from the public Gitea releases with no root** (D18). 
A +**`.deb` remains an optional** system-install channel for users who prefer it (updated via +apt). *Why the revision:* the repo is public and we want frictionless, GUI-first self-updates, +which a root-owned system package can't apply silently. The interactive installer (M9) layers +module selection on top of either channel. AUR / Flatpak / COPR still later, if warranted. ### D9 — Scope of action (read-only vs apply-fixes) — *DECIDED 2026-05-21* **Read-only + suggestions.** RigDoctor diagnoses, monitors, and **suggests** actions in @@ -118,10 +123,69 @@ build or maintain mappings for other package managers. A thin seam is left in th another package manager *could* be added later, but multi-distro support is **not** a planned deliverable. Revisit only if Ubuntu-only proves too narrow. +### D16 — Session sharing / remote assist (M12) — *DECIDED 2026-05-21* +Build a **session-sharing / remote-assist** capability (new module **M12**) so a user (A) +can let a helper (B) inspect their machine. **Full ladder, built in order:** +1. **Diagnostic bundle export** — `share export` packages inventory (M5) + recent capture + log (M3) + a report into one file A sends to B; B opens it in RigDoctor. One-way, no live + connection. Safest; build first. +2. **Live read-only view** — a small local server serving the live dashboard + logs + read-only, reached over a **user-chosen tunnel** (Tailscale / cloudflared / SSH reverse + tunnel — *no RigDoctor-hosted relay*, to keep the no-telemetry promise). Token-gated, + short TTL, A approves and can kill instantly. No terminal. +3. **Gated interactive terminal** — wrap an existing trusted tool (`tmate`/`sshx`) rather + than rolling our own; **read-only link by default**, read-write requires explicit + per-session consent. This is a deliberate, consent-gated exception to the read-only stance + (D9) — it's full machine access and must be treated as such. 
+ +*Cross-cutting principles:* explicit per-session consent; ephemeral, revocable tokens; +clear permission escalation (view ≠ shell); no mandatory central relay; session audit log. +*Note:* this adds M12 on top of the "final" list from D14; the catalog is updated accordingly. + +### D17 — GUI-first interface emphasis — *DECIDED 2026-05-21* +The **desktop GUI (M10) is the primary, default interface** for end users — it's the more +user-friendly way in, and **every capability** (recording, reports, status, …) must be +reachable from it. This **supersedes the earlier "CLI-first / terminal-first" framing** +(updates D5 and the SPEC wording). +- *The CLI is not removed:* it keeps **full functionality** for headless / SSH / server / + scripting use, and it's the engine the background daemon runs on. +- *No change to layering (D2):* the core, CLI, and daemon stay **stdlib-only** and must run + without Qt. "GUI-first" is about emphasis and front-end parity, not dropping headless support. + +### D18 — Auto-update (M13) — *PLANNED 2026-05-21* +RigDoctor should **check for a newer version on launch and self-update** (new module **M13**). +**Mechanism (chosen): user-local, no-root self-update from the public repo.** +- *Install model (D8 revised):* primary install is **user-local** (`~/.local`), so the running + app can replace its own files and update with **no apt, no root, no password prompt**. +- *Check:* on launch, query the **public Gitea releases API** + (`/api/v1/repos/jessey/rigdoctor/releases/latest`) over HTTPS; compare to the running version. +- *Apply:* download the new release bundle, **verify checksum/signature**, stage it + (e.g. `~/.local/share/rigdoctor/versions/x.y.z`), swap a symlink atomically, then restart + (including the `systemd --user` daemon). +- *GUI-first (D17):* a non-intrusive "update available" prompt + one-click apply; `rigdoctor + update` in the CLI. 
+- *Security:* HTTPS only; verify checksum/signature before swapping; never run unverified code. +- *Privacy (no telemetry):* version-check only — no tracking; auto-check is opt-out-able. +- *`.deb` users:* the optional `.deb` channel updates via apt instead; auto-update targets the + user-local install. +- *Caveat (to confirm before building):* the Gitea instance currently **requires sign-in for + API calls** (`"Only signed in user is allowed to call APIs."`), so anonymous version checks + need the instance/repo set to allow anonymous access — or a separate public version endpoint + (e.g. a static file or a mirror). + +### D19 — Versioning & changelog — *DECIDED 2026-05-21* +**Track a version number on every change.** SemVer-style `MAJOR.MINOR.PATCH` (pre-1.0: bump +PATCH for ordinary changes, MINOR for larger milestones). `__version__` +(`rigdoctor/__init__.py`) and `pyproject.toml` are the single source of truth and **must match +the git release tag** so the auto-updater (D18) can compare versions. Every change updates +`CHANGELOG.md` (Keep a Changelog style). *Note:* an early placeholder `0.1.0` was corrected to +follow the released **0.0.x** line — first release was **V0.0.1**; current is **0.0.2**. + ## Open -None currently — all tracked decisions (D1–D15) are resolved. New questions will be added +None currently — all tracked decisions (D1–D19) are resolved. New questions will be added here as they arise. Remaining detail to flesh out during build: the tray's supporting-action -set (D13 proposed list) and per-module apt package names (filled in as modules land). +set (D13), per-module apt package names, M12's tunnel/token specifics, and M13's +anonymous release-API access for version checks (the D18 caveat). diff --git a/docs/MODULES.md b/docs/MODULES.md index 2f86cb7..223aad2 100644 --- a/docs/MODULES.md +++ b/docs/MODULES.md @@ -2,14 +2,14 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done -> Final module set (D14).
**M7 (stress/repro) was dropped (D7).** M10/M11 are the GUI and -> tray modules (D10/D11). GPU scope reads "all (NVIDIA first)" — NVIDIA is implemented first, -> others via the vendor abstraction (D4). +> Module set per D14, plus **M12 (session sharing, D16)** and **M13 (auto-update, D18)**. +> **M7 (stress/repro) was dropped (D7).** M10/M11 are the GUI and tray modules (D10/D11). +> GPU scope reads "all (NVIDIA first)" — NVIDIA first, others via the vendor abstraction (D4). | ID | Module | Bundle | Key deps | GPU scope | Priority | Status | |----|--------|--------|----------|-----------|----------|--------| | M1 | Sensor core | Essential | none (nvidia-smi, sysfs) | all (NVIDIA first) | P0 | ⬜ | -| M3 | Crash-capture logger | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | ⬜ | +| M3 | Crash-capture logger | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | 🟨 | | M4 | Health report (log scan) | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | ⬜ | | M2 | Live monitor (TUI) | Monitoring | none (stdlib curses) | all | P1 | ⬜ | | M8 | Alerting | Monitoring | libnotify (opt) | all | P2 | ⬜ | @@ -18,6 +18,8 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done | M10 | Desktop GUI | Desktop UI | **python3-pyside6** | all | P2 | 🟨 | | M11 | Tray / menu-bar applet | Desktop UI | **python3-pyside6** (+ AppIndicator on GNOME) | all | P2 | ⬜ | | M9 | Installer | (meta) | none | all | P1 | ⬜ | +| M12 | Session sharing / remote assist | Sharing | none (Tier 3: tmate/sshx) | all | P3 | ⬜ | +| M13 | Auto-update | (core) | none (stdlib; user-local file swap) | all | P3 | ⬜ | | ~~M7~~ | ~~Stress / repro~~ | — | — | — | — | ❌ dropped (D7) | ## Notes per module @@ -26,6 +28,11 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done - **M3 Crash-capture logger** — the highest-value piece for the seed use case. 
`fsync` per sample; GPU-lost detection via query timeout; bounded rotation; `systemd --user` service with a **user-selectable trigger mode** (always-on / game-launch / manual — D6). + *Implemented (manual trigger):* JSONL log with fsync-per-sample, size-based rotation + (`log_max_bytes`/`log_backups`), GPU-lost/recovered event markers, atomic status file, and + `rigdoctor record run|start|stop|status|report`. The foreground `run` is the systemd-ready + entrypoint; the service unit + always-on/game-launch triggers (D6/D12) land in Phase 4. + Also fully driven from the GUI's Recording/Logs page (M10) via shared `core.reccontrol`. - **M4 Health report** — turns scattered logs into a prioritized, plain-language findings list with **suggested** fixes (read-only, D9). Reuses M1 for a live snapshot. Also powers the **guided diagnostic session** (with M3): pick a game → focused capture → scan → @@ -37,20 +44,33 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done - **M10 Desktop GUI** — PySide6 graphical front-end over the core engine (dashboard, log browser, report viewer, logger controls). Optional; adds the Qt dependency. *Bootstrapped early (ahead of its Phase 4 slot) at the user's request:* dark-themed window with sidebar - nav and a live dashboard (circular gauges + collapsible per-subsystem cards, temperature- - colored values); Logs/Health/Inventory are placeholders until M3–M5. + nav, a live dashboard (circular gauges + collapsible per-subsystem cards, temperature- + colored values), and a **Recording/Logs page** with full M3 controls (start/stop/status + + post-crash report). Health/Inventory remain placeholders until M4/M5. GUI-first per D17. - **M11 Tray applet** — `QSystemTrayIcon` menu-bar applet. Dropdown shows live M1 readouts (CPU temp, GPU temp, memory used/total, status dot) and is led by a **Run Diagnostic** action (the guided diagnostic session), plus Open dashboard / Start-Stop recording / Snapshot / Quit (D13). 
Optional; shares the Qt dependency with M10. - **M9 Installer** — interactive wizard layered on the `.deb` (D8); apt-first dependency resolution; enables the logger service and trigger mode. +- **M12 Session sharing / remote assist** (D16) — let a helper inspect a user's machine, in + an escalating ladder: (1) **diagnostic bundle export** (inventory + recent log + report, + one-way), (2) **live read-only view** over a user-chosen tunnel (Tailscale/cloudflared/SSH, + no hosted relay), (3) **gated interactive terminal** wrapping tmate/sshx (read-only by + default; read-write only on explicit consent — a deliberate exception to D9). Per-session + consent, ephemeral revocable tokens, audit log. +- **M13 Auto-update** (D18) — *planned.* On launch, check the public Gitea releases API and + **self-update a user-local install with no root** (download → verify checksum/signature → + atomic symlink swap → restart, incl. the daemon). HTTPS-only, version-check-only (no + telemetry), opt-out-able. Surfaced in the GUI; `rigdoctor update` in the CLI. (`.deb` users + update via apt instead.) ## Bundles (final — D14) - **Essential:** M1 + M3 + M4 *(the MVP, NVIDIA-only — D5)* - **Monitoring:** M2 + M8 - **Diagnostics:** M5 + M6 - **Desktop UI:** M10 + M11 *(adds PySide6)* +- **Sharing:** M12 *(session sharing / remote assist — D16)* ## MVP candidate — *confirmed (D5)* **M1 + M3 + M4 (Essential), NVIDIA-only, CLI-first.** Gives a working tool that captures the diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 637599f..8b44d38 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -11,13 +11,13 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`). 
- [x] Lock the MVP scope (M1 + M3 + M4, NVIDIA-only) ## Phase 1 — MVP: capture *this* crash (Essential bundle, NVIDIA-only, CLI) -- [ ] M1 sensor core (NVIDIA via nvidia-smi + hwmon for CPU/RAM/NVMe), stdlib-only -- [ ] M3 crash-capture logger (CSV, fsync per sample, GPU-lost detection, rotation, - `systemd --user` service) -- [ ] Manual trigger mode first (`rigdoctor record start/stop`); other modes in Phase 4 +- [x] M1 sensor core (NVIDIA via nvidia-smi + hwmon for CPU/RAM/NVMe), stdlib-only +- [x] M3 crash-capture logger (JSONL, fsync per sample, GPU-lost detection, size rotation) +- [x] Manual trigger mode (`rigdoctor record run/start/stop/status`); `systemd --user` + service + other trigger modes in Phase 4 (`run` is already the service entrypoint) - [ ] M4 health report (Xid/panic/OOM/MCE/AER/thermal scan + driver-mismatch + snapshot, suggested fixes only — D9) -- [ ] `--report` post-crash summary (max temps/power, throttle events, last N samples) +- [x] `record report` post-crash summary (peak temps/power per subsystem, events, last N samples) - **Exit criteria:** user can run it during gaming and, after a freeze/black-screen, see the last readings + a plausible cause. @@ -46,9 +46,20 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`). ## Phase 5 — Breadth (later) - [ ] AMD GPU support in M1 (Steam Deck / Radeon) - [ ] Intel GPU best-effort +- [ ] M13 auto-update (D18) — launch-time version check + no-root self-update of the + user-local install from the public Gitea releases; GUI prompt + `rigdoctor update` - [ ] (Later, separate milestone) Optional auto-apply of suggested fixes behind explicit consent — currently out of scope (D9) +## Phase 6 — Session sharing / remote assist (M12, D16) +Escalating ladder, built in order: +- [ ] Tier 1: `share export` — diagnostic bundle (inventory + recent log + report); B opens + it in RigDoctor. One-way, safest. 
+- [ ] Tier 2: live read-only view (local server + user-chosen tunnel: Tailscale/cloudflared/ + SSH; no hosted relay), token-gated, A approves, revocable. +- [ ] Tier 3: gated interactive terminal (wrap tmate/sshx; read-only default, read-write on + explicit consent), with session audit log. + > **Out of scope:** stress/repro module (D7); multi-distro support and packaging beyond > Ubuntu/apt + `.deb` (D15) — a thin seam is kept but not built out. diff --git a/docs/SPEC.md b/docs/SPEC.md index ed9371f..a1d1a60 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -31,8 +31,9 @@ RigDoctor's crash-safe logger is designed to fix exactly that. - Catch and preserve the machine's state in the seconds before a hard freeze. - Make hard-to-investigate gaming faults debuggable: collect scattered signals, correlate them, and explain them. -- Offer **three ways to run**: full **CLI / headless** (works over SSH), a **desktop GUI**, - and a **system-tray / top-menu-bar applet** with quick actions. (D10/D11) +- Be **GUI-first** (D17): the **desktop GUI** is the primary interface, complemented by a + **system-tray / top-menu-bar applet** for quick actions — backed by a **full CLI** that + keeps complete functionality for headless / SSH / scripting use. (D10/D11/D17) - Be modular: a novice installs a one-click "monitor + capture + report" bundle; a power user installs everything including the GUI, tray, and diagnostics. - Low overhead; safe defaults; no telemetry/phone-home. @@ -135,7 +136,18 @@ rather than adding a new one. Interactive wizard: detect GPU vendor (NVIDIA-first) → present module menu grouped into bundles with descriptions and the exact packages each needs → resolve & install (apt first) → write config → optionally enable the `systemd --user` logger service and pick its trigger -mode. Delivered alongside the `.deb` (D8). Module list/bundling is final per D14. +mode. Delivered with the user-local install (and the optional `.deb`) (D8). Module +list/bundling is final per D14. 
+ +### M12 — Session sharing / remote assist (D16) +Lets a user (A) grant a helper (B) inspection access, as an escalating, consent-driven +ladder: (1) **diagnostic bundle export** (inventory + recent capture log + report, one-way); +(2) **live read-only view** of the dashboard + logs over a user-chosen tunnel +(Tailscale/cloudflared/SSH — no RigDoctor-hosted relay); (3) **gated interactive terminal** +wrapping an existing tool (tmate/sshx), read-only by default, read-write only on explicit +consent. Per-session consent, ephemeral revocable tokens, permission escalation (view ≠ +shell), and a session audit log. Tier 3 is a deliberate, consent-gated exception to the +read-only stance (D9). Built in Phase 6. ## 5. Non-functional requirements - **Zero hard deps for the core/CLI/daemon** — Python stdlib + tools already present. **Qt @@ -144,8 +156,9 @@ mode. Delivered alongside the `.deb` (D8). Module list/bundling is final per D14 - **Crash-safe logging** — flush + `fsync` per sample; bounded disk usage. - **Low overhead** — default ≤1 Hz sampling; negligible CPU/GPU cost. The always-on daemon is stdlib-only (no Qt loaded) so it stays tiny. -- **Headless-equivalent** — every diagnostic capability is reachable from the CLI; the GUI - and tray are conveniences over the same engine, never the only way to do something. +- **GUI-first, CLI-complete** (D17) — the GUI is the primary interface, but every capability + is *also* reachable from the CLI so RigDoctor runs fully headless (SSH/servers). Both + front-ends sit over the same engine; neither is the only way to do something. - **Privacy** — local only; inventory export is opt-in and reviewable; no telemetry. - **Portability** — graceful degradation when a sensor/tool is unavailable (N/A, not crash). 
diff --git a/pyproject.toml b/pyproject.toml index 7d20eba..6e4370d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "rigdoctor" -version = "0.1.0" +version = "0.0.2" description = "Modular hardware monitoring & crash diagnostics for Linux gamers." readme = "README.md" requires-python = ">=3.11" diff --git a/src/rigdoctor/__init__.py b/src/rigdoctor/__init__.py index 460e6ed..0e85ae4 100644 --- a/src/rigdoctor/__init__.py +++ b/src/rigdoctor/__init__.py @@ -1,3 +1,3 @@ """RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers.""" -__version__ = "0.1.0" +__version__ = "0.0.2" diff --git a/src/rigdoctor/cli.py b/src/rigdoctor/cli.py index fc73040..05e1c06 100644 --- a/src/rigdoctor/cli.py +++ b/src/rigdoctor/cli.py @@ -4,13 +4,18 @@ from __future__ import annotations import argparse import json +import os +import signal import sys +import time +from pathlib import Path -from . import __version__ +from . 
import __version__, config from .config import load_config +from .core import reccontrol from .core.sampler import Sampler from .core.sources import available_sources -from .render import render_snapshot +from .render import format_headline, render_snapshot, render_summary def _sampler() -> Sampler: @@ -64,9 +69,99 @@ def cmd_gui(args) -> int: return gui_main([sys.argv[0]]) -def cmd_record(args) -> int: - print("`record` (M3 crash-capture logger) is not implemented yet — next on the roadmap.") - return 2 +# --- M3 crash-capture logger --------------------------------------------------- + +def cmd_record_run(args) -> int: + cfg = load_config() + interval = args.interval or cfg["interval"] + log_path = Path(args.out) if args.out else config.LOG_FILE + config.STATE_DIR.mkdir(parents=True, exist_ok=True) + config.PID_FILE.write_text(str(os.getpid())) + + from .core.recorder import Recorder + + recorder = Recorder( + interval=interval, + log_path=log_path, + max_bytes=cfg["log_max_bytes"], + backups=cfg["log_backups"], + status_path=config.STATUS_FILE, + ) + + def _handle(_sig, _frame): + recorder.stop() + + signal.signal(signal.SIGTERM, _handle) + signal.signal(signal.SIGINT, _handle) + + print(f"Recording to {log_path} every {interval:g}s — stop with Ctrl-C or `rigdoctor record stop`.") + try: + recorder.run() + finally: + try: + config.PID_FILE.unlink() + except OSError: + pass + print(f"Stopped after {recorder.samples} samples.") + return 0 + + +def cmd_record_start(args) -> int: + if reccontrol.running_pid(): + print(f"Recorder already running (pid {reccontrol.running_pid()}).") + return 0 + pid = reccontrol.start_background(args.interval, args.out) + time.sleep(1.0) # let it come up + if pid and reccontrol.pid_alive(pid): + print(f"Recording started in the background (pid {pid}).") + print(f" log: {args.out or config.LOG_FILE}") + print(" status: rigdoctor record status · stop: rigdoctor record stop") + return 0 + print(f"Recorder failed to start; see 
{config.SPAWN_LOG}") + return 1 + + +def cmd_record_stop(args) -> int: + pid = reccontrol.running_pid() + if not pid: + print("Recorder is not running.") + return 0 + if not reccontrol.stop_background(): + print(f"Could not stop recorder (pid {pid}).") + return 1 + for _ in range(50): + if not reccontrol.pid_alive(pid): + break + time.sleep(0.1) + print(f"Recorder stopped (pid {pid}).") + return 0 + + +def cmd_record_status(args) -> int: + pid = reccontrol.running_pid() + status = reccontrol.read_status() + print(f"● recording (pid {pid})" if pid else "○ not recording") + if status: + print(f" log: {status.get('log')}") + print(f" samples: {status.get('samples')}") + if status.get("started"): + print(f" started: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(status['started']))}") + if status.get("updated"): + print(f" updated: {time.strftime('%H:%M:%S', time.localtime(status['updated']))}") + if status.get("gpu_lost"): + print(" ⚠ a GPU-lost event was recorded this session") + if status.get("latest"): + print(f" latest: {format_headline(status['latest'])}") + return 0 + + +def cmd_record_report(args) -> int: + from .core.crashlog import summarize + + log_path = Path(args.log) if args.log else config.LOG_FILE + summary = summarize(log_path, last_n=args.last) + print(render_summary(summary, log_path=log_path)) + return 0 def cmd_report(args) -> int: @@ -92,7 +187,28 @@ def build_parser() -> argparse.ArgumentParser: sub.add_parser("gui", help="launch the desktop GUI (needs PySide6)").set_defaults(func=cmd_gui) sub.add_parser("sources", help="list detected sensor sources").set_defaults(func=cmd_sources) - sub.add_parser("record", help="crash-capture logger (coming soon)").set_defaults(func=cmd_record) + + rec = sub.add_parser("record", help="crash-capture logger (M3)") + rec_sub = rec.add_subparsers(dest="record_cmd", required=True) + + run_p = rec_sub.add_parser("run", help="run the capture loop in the foreground (systemd-friendly)") + run_p.add_argument("-n", 
"--interval", type=float, default=None, help="sampling interval (s)") + run_p.add_argument("-o", "--out", default=None, help="log file path") + run_p.set_defaults(func=cmd_record_run) + + start_p = rec_sub.add_parser("start", help="start recording in the background") + start_p.add_argument("-n", "--interval", type=float, default=None, help="sampling interval (s)") + start_p.add_argument("-o", "--out", default=None, help="log file path") + start_p.set_defaults(func=cmd_record_start) + + rec_sub.add_parser("stop", help="stop background recording").set_defaults(func=cmd_record_stop) + rec_sub.add_parser("status", help="show recorder status").set_defaults(func=cmd_record_status) + + report_p = rec_sub.add_parser("report", help="summarize the captured log (post-crash)") + report_p.add_argument("--last", type=int, default=10, help="recent samples to show") + report_p.add_argument("--log", default=None, help="path to a capture log") + report_p.set_defaults(func=cmd_record_report) + sub.add_parser("report", help="health report (coming soon)").set_defaults(func=cmd_report) return p diff --git a/src/rigdoctor/config.py b/src/rigdoctor/config.py index 23cc60d..f57f246 100644 --- a/src/rigdoctor/config.py +++ b/src/rigdoctor/config.py @@ -19,8 +19,16 @@ STATE_DIR = _xdg("XDG_STATE_HOME", ".local/state") LOG_DIR = DATA_DIR / "logs" CONFIG_FILE = CONFIG_DIR / "config.toml" +# Crash-capture logger (M3) +LOG_FILE = LOG_DIR / "capture.jsonl" +STATUS_FILE = STATE_DIR / "recorder.json" +PID_FILE = STATE_DIR / "recorder.pid" +SPAWN_LOG = STATE_DIR / "recorder.out" + DEFAULTS: dict = { - "interval": 1.0, # sampling interval in seconds (default ≤1 Hz, low overhead — NFR) + "interval": 1.0, # sampling interval in seconds (default ≤1 Hz — NFR) + "log_max_bytes": 20_000_000, # rotate a log segment past this size + "log_backups": 10, # keep this many rotated segments (bounds disk use) } diff --git a/src/rigdoctor/core/crashlog.py b/src/rigdoctor/core/crashlog.py new file mode 100644 index 
0000000..da55bab --- /dev/null +++ b/src/rigdoctor/core/crashlog.py @@ -0,0 +1,177 @@ +"""Crash-capture log (M3): rotating, fsync-per-sample JSONL writer + reader + summary. + +On-disk format is JSON Lines, one record per line: + sample : {"ts": , "readings": [[source, metric, value, unit, label], ...]} + event : {"ts": , "event": , "detail": } + +Every line is flushed and fsync'd, so the readings right before a hard lock survive. +A torn final line (interrupted mid-write by a crash) is tolerated on read. +""" + +from __future__ import annotations + +import json +import os +import time +from collections import deque +from dataclasses import dataclass +from pathlib import Path + +from .sample import Reading, Sample + + +class CrashLogWriter: + """Append samples/events as JSONL, fsync per line, rotate by size.""" + + def __init__(self, path, max_bytes: int = 20_000_000, backups: int = 10) -> None: + self.path = Path(path) + self.max_bytes = int(max_bytes) + self.backups = int(backups) + self.path.parent.mkdir(parents=True, exist_ok=True) + self._fh = open(self.path, "a", encoding="utf-8") + + def _write(self, obj: dict) -> None: + self._fh.write(json.dumps(obj, separators=(",", ":"), ensure_ascii=False)) + self._fh.write("\n") + self._fh.flush() + os.fsync(self._fh.fileno()) # survive a hard lock + if self.max_bytes and self._fh.tell() >= self.max_bytes: + self._rotate() + + def write_sample(self, sample: Sample) -> None: + rows = [[r.source, r.metric, r.value, r.unit, r.label] for r in sample.readings] + self._write({"ts": round(sample.ts, 3), "readings": rows}) + + def write_event(self, kind: str, detail: str = "") -> None: + self._write({"ts": round(time.time(), 3), "event": kind, "detail": detail}) + + def _rotate(self) -> None: + # Mirror logging.handlers.RotatingFileHandler: shift base.i -> base.i+1. 
+ self._fh.close() + base = str(self.path) + for i in range(self.backups - 1, 0, -1): + src = Path(f"{base}.{i}") + dst = Path(f"{base}.{i + 1}") + if src.exists(): + if dst.exists(): + dst.unlink() + src.rename(dst) + if self.backups > 0: + first = Path(f"{base}.1") + if first.exists(): + first.unlink() + self.path.rename(first) + self._fh = open(self.path, "a", encoding="utf-8") + + def close(self) -> None: + try: + self._fh.close() + except Exception: + pass + + +def _segment_files(path) -> list[Path]: + """All log segments oldest→newest: base.N … base.1, base.""" + base = Path(path) + numbered: list[tuple[int, Path]] = [] + for p in base.parent.glob(base.name + ".*"): + suffix = p.name[len(base.name) + 1:] + if suffix.isdigit(): + numbered.append((int(suffix), p)) + numbered.sort(reverse=True) # highest number = oldest + files = [p for _, p in numbered] + if base.exists(): + files.append(base) + return files + + +def iter_records(path, include_backups: bool = True): + """Yield parsed records oldest→newest, tolerating a torn final line.""" + files = _segment_files(path) if include_backups else [Path(path)] + for f in files: + try: + with open(f, encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + yield json.loads(line) + except ValueError: + continue + except OSError: + continue + + +def record_to_sample(rec: dict) -> Sample: + readings = [Reading(s, m, v, u, label) for s, m, v, u, label in rec.get("readings", [])] + return Sample(ts=rec.get("ts", 0.0), readings=readings) + + +def headline(sample: Sample) -> dict: + """Extract the few at-a-glance values used by status/report displays.""" + + def find(source: str, metric: str, label: str | None = None): + for r in sample.readings: + if r.source == source and r.metric == metric and (label is None or r.label == label): + return r.value + return None + + cpu_pkg = None + cpu_temps = [] + for r in sample.readings: + if r.source == "cpu" and r.metric == "temp" and 
r.value is not None: + cpu_temps.append(r.value) + low = r.label.lower() + if cpu_pkg is None and (low.startswith("package") or "tctl" in low or "tdie" in low): + cpu_pkg = r.value + if cpu_pkg is None and cpu_temps: + cpu_pkg = max(cpu_temps) + + return { + "gpu_temp": find("gpu", "temp", ""), + "gpu_util": find("gpu", "util"), + "gpu_power": find("gpu", "power"), + "cpu_temp": cpu_pkg, + "mem_pct": find("memory", "used_pct"), + } + + +@dataclass +class Summary: + start: float | None + end: float | None + samples: int + maxima: dict # reading.key -> (value, unit, ts) + events: list # [(ts, kind, detail), ...] + last: list # [Sample, ...] most recent + + +def summarize(path, last_n: int = 10) -> Summary: + start = end = None + count = 0 + maxima: dict = {} + events: list = [] + recent: deque = deque(maxlen=last_n) + + for rec in iter_records(path): + ts = rec.get("ts") + if "event" in rec: + events.append((ts, rec.get("event", ""), rec.get("detail", ""))) + continue + if "readings" not in rec: + continue + count += 1 + if start is None: + start = ts + end = ts + sample = record_to_sample(rec) + recent.append(sample) + for r in sample.readings: + if r.value is None: + continue + current = maxima.get(r.key) + if current is None or r.value > current[0]: + maxima[r.key] = (r.value, r.unit, ts) + + return Summary(start, end, count, maxima, events, list(recent)) diff --git a/src/rigdoctor/core/reccontrol.py b/src/rigdoctor/core/reccontrol.py new file mode 100644 index 0000000..ac33be9 --- /dev/null +++ b/src/rigdoctor/core/reccontrol.py @@ -0,0 +1,71 @@ +"""Background-process control for the crash-capture recorder (shared by CLI + GUI). + +Both front-ends start/stop/inspect the same `systemd`-style detached recorder via the +PID and status files, so behaviour is identical however you drive it. +""" + +from __future__ import annotations + +import json +import os +import signal +import subprocess +import sys + +from .. 
import config + + +def pid_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + except OSError: + return False + return True + + +def running_pid() -> int | None: + try: + pid = int(config.PID_FILE.read_text().strip()) + except (OSError, ValueError): + return None + return pid if pid_alive(pid) else None + + +def read_status() -> dict | None: + try: + return json.loads(config.STATUS_FILE.read_text()) + except (OSError, ValueError): + return None + + +def start_background(interval: float | None = None, out: str | None = None) -> int | None: + """Spawn a detached `record run`. Returns the child pid, or None if already running.""" + if running_pid(): + return None + config.STATE_DIR.mkdir(parents=True, exist_ok=True) + cmd = [sys.executable, "-m", "rigdoctor", "record", "run"] + if interval: + cmd += ["--interval", str(interval)] + if out: + cmd += ["--out", out] + out_fh = open(config.SPAWN_LOG, "a") + proc = subprocess.Popen( + cmd, + stdout=out_fh, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + start_new_session=True, + ) + return proc.pid + + +def stop_background() -> bool: + """Signal the running recorder to stop. Returns False if it wasn't running.""" + pid = running_pid() + if not pid: + return False + try: + os.kill(pid, signal.SIGTERM) + except OSError: + return False + return True diff --git a/src/rigdoctor/core/recorder.py b/src/rigdoctor/core/recorder.py new file mode 100644 index 0000000..d9f2187 --- /dev/null +++ b/src/rigdoctor/core/recorder.py @@ -0,0 +1,93 @@ +"""Crash-capture recorder (M3): the sampling loop that writes a crash-safe log. + +Runs in the foreground (so it works as a `systemd --user` ExecStart and under +manual `record run`). Stop it by calling stop() — typically from a SIGTERM/SIGINT +handler installed by the CLI. 
+""" + +from __future__ import annotations + +import json +import os +import threading +import time +from pathlib import Path + +from .crashlog import CrashLogWriter, headline +from .sampler import Sampler +from .sources import available_sources + + +class Recorder: + def __init__( + self, + interval: float, + log_path, + max_bytes: int = 20_000_000, + backups: int = 10, + status_path=None, + sampler: Sampler | None = None, + ) -> None: + self.interval = interval + self.sampler = sampler or Sampler(available_sources()) + self.writer = CrashLogWriter(log_path, max_bytes, backups) + self.log_path = Path(log_path) + self.status_path = Path(status_path) if status_path else None + self.samples = 0 + self._stop = threading.Event() + self._gpu_lost = False + self._started = time.time() + + def stop(self) -> None: + self._stop.set() + + def run(self) -> None: + self.writer.write_event("session-start", f"interval={self.interval:g}s") + self._write_status(running=True) + try: + while not self._stop.is_set(): + t0 = time.monotonic() + sample = self.sampler.sample() + self.writer.write_sample(sample) + self.samples += 1 + self._detect_gpu_lost(sample) + self._write_status(running=True, sample=sample) + self._stop.wait(max(0.0, self.interval - (time.monotonic() - t0))) + finally: + self.writer.write_event("session-stop", f"samples={self.samples}") + self.writer.close() + self._write_status(running=False) + + def _detect_gpu_lost(self, sample) -> None: + lost = any( + r.source == "gpu" and r.metric == "status" and r.label == "query-timeout" + for r in sample.readings + ) + if lost and not self._gpu_lost: + self._gpu_lost = True + self.writer.write_event("gpu-lost", "nvidia-smi query timed out — GPU may be hung/lost") + elif not lost and self._gpu_lost: + self._gpu_lost = False + self.writer.write_event("gpu-recovered", "GPU responding again") + + def _write_status(self, running: bool, sample=None) -> None: + if self.status_path is None: + return + data = { + "running": running, 
+ "pid": os.getpid(), + "log": str(self.log_path), + "started": self._started, + "samples": self.samples, + "updated": time.time(), + "gpu_lost": self._gpu_lost, + } + if sample is not None: + data["latest"] = headline(sample) + try: + self.status_path.parent.mkdir(parents=True, exist_ok=True) + tmp = self.status_path.with_suffix(self.status_path.suffix + ".tmp") + tmp.write_text(json.dumps(data)) + tmp.replace(self.status_path) # atomic + except OSError: + pass diff --git a/src/rigdoctor/gui/main_window.py b/src/rigdoctor/gui/main_window.py index 8143903..ce71d6b 100644 --- a/src/rigdoctor/gui/main_window.py +++ b/src/rigdoctor/gui/main_window.py @@ -16,12 +16,12 @@ from PySide6.QtWidgets import ( ) from .dashboard import Dashboard +from .recorder_page import RecorderPage from .theme import ACCENT, MUTED from .worker import SamplerWorker _NAV_ITEMS = ["Dashboard", "Logs", "Health", "Inventory"] _PLACEHOLDERS = { - "Logs": "Captured crash logs will appear here once the logger (M3) lands.", "Health": "The health report (M4) — log scan + plain-language findings — lands here.", "Inventory": "System inventory (M5) — CPU/GPU/board/RAM/drivers — lands here.", } @@ -46,9 +46,11 @@ class MainWindow(QMainWindow): content_layout.setContentsMargins(0, 0, 0, 0) self._stack = QStackedWidget() self.dashboard = Dashboard() - self._stack.addWidget(self.dashboard) - for name in _NAV_ITEMS[1:]: - self._stack.addWidget(self._placeholder_page(name, _PLACEHOLDERS[name])) + self.recorder_page = RecorderPage() + self._stack.addWidget(self.dashboard) # 0 Dashboard + self._stack.addWidget(self.recorder_page) # 1 Logs + self._stack.addWidget(self._placeholder_page("Health", _PLACEHOLDERS["Health"])) # 2 + self._stack.addWidget(self._placeholder_page("Inventory", _PLACEHOLDERS["Inventory"])) # 3 content_layout.addWidget(self._stack) layout.addWidget(self._build_sidebar()) diff --git a/src/rigdoctor/gui/recorder_page.py b/src/rigdoctor/gui/recorder_page.py new file mode 100644 index 
0000000..efcf50c --- /dev/null +++ b/src/rigdoctor/gui/recorder_page.py @@ -0,0 +1,185 @@ +"""Recording & Logs page (M3 in the GUI): start/stop/status + post-crash report. + +Drives the same background recorder as the CLI via core.reccontrol, so the GUI and +`rigdoctor record …` are interchangeable. +""" + +from __future__ import annotations + +import time + +from PySide6.QtCore import Qt, QTimer, QUrl +from PySide6.QtGui import QDesktopServices, QFont +from PySide6.QtWidgets import ( + QDoubleSpinBox, + QFrame, + QHBoxLayout, + QLabel, + QPushButton, + QTextEdit, + QVBoxLayout, + QWidget, +) + +from .. import config +from ..core import reccontrol +from ..core.crashlog import summarize +from ..render import format_headline, render_summary +from .theme import GOOD, MUTED, WARN + + +def _panel(title: str) -> tuple[QFrame, QVBoxLayout]: + frame = QFrame() + frame.setObjectName("Card") + layout = QVBoxLayout(frame) + layout.setContentsMargins(16, 14, 16, 14) + layout.setSpacing(10) + label = QLabel(title) + label.setStyleSheet("font-weight: 700; background: transparent;") + layout.addWidget(label) + return frame, layout + + +def _fmt_time(value, fmt="%Y-%m-%d %H:%M:%S") -> str: + return time.strftime(fmt, time.localtime(value)) if value else "—" + + +class RecorderPage(QWidget): + def __init__(self) -> None: + super().__init__() + self.setObjectName("Page") + root = QVBoxLayout(self) + root.setContentsMargins(20, 18, 20, 18) + root.setSpacing(16) + + title = QLabel("Recording") + title.setObjectName("PageTitle") + root.addWidget(title) + + # --- Status + controls ------------------------------------------------- + status_card, status_layout = _panel("Status") + + self._state = QLabel("○ Not recording") + self._state.setStyleSheet(f"color: {MUTED}; font-weight: 700; background: transparent;") + status_layout.addWidget(self._state) + + self._info = QLabel("") + self._info.setObjectName("Muted") + status_layout.addWidget(self._info) + + self._latest = QLabel("") + 
status_layout.addWidget(self._latest) + + self._warn = QLabel("") + self._warn.setStyleSheet(f"color: {WARN}; font-weight: 600; background: transparent;") + self._warn.setVisible(False) + status_layout.addWidget(self._warn) + + controls = QHBoxLayout() + controls.setSpacing(8) + controls.addWidget(QLabel("Interval (s)")) + self._interval = QDoubleSpinBox() + self._interval.setRange(0.1, 10.0) + self._interval.setSingleStep(0.1) + self._interval.setValue(float(config.DEFAULTS["interval"])) + controls.addWidget(self._interval) + self._start_btn = QPushButton("Start recording") + self._start_btn.setObjectName("PrimaryButton") + self._start_btn.clicked.connect(self._on_start) + self._stop_btn = QPushButton("Stop") + self._stop_btn.clicked.connect(self._on_stop) + controls.addWidget(self._start_btn) + controls.addWidget(self._stop_btn) + controls.addStretch(1) + folder_btn = QPushButton("Open log folder") + folder_btn.clicked.connect(self._open_folder) + controls.addWidget(folder_btn) + status_layout.addLayout(controls) + root.addWidget(status_card) + + # --- Report ------------------------------------------------------------ + report_card = QFrame() + report_card.setObjectName("Card") + report_layout = QVBoxLayout(report_card) + report_layout.setContentsMargins(16, 14, 16, 14) + report_layout.setSpacing(10) + header = QHBoxLayout() + report_title = QLabel("Post-crash report") + report_title.setStyleSheet("font-weight: 700; background: transparent;") + header.addWidget(report_title) + header.addStretch(1) + refresh_btn = QPushButton("Refresh") + refresh_btn.clicked.connect(self._load_report) + header.addWidget(refresh_btn) + report_layout.addLayout(header) + + self._report = QTextEdit() + self._report.setObjectName("Report") + self._report.setReadOnly(True) + self._report.setFont(QFont("monospace", 10)) + self._report.setLineWrapMode(QTextEdit.LineWrapMode.NoWrap) + report_layout.addWidget(self._report) + root.addWidget(report_card, 1) + + # Poll recorder status once a 
second (reflects CLI-driven sessions too). + self._timer = QTimer(self) + self._timer.setInterval(1000) + self._timer.timeout.connect(self._refresh_status) + self._timer.start() + self._refresh_status() + self._load_report() + + # --- actions --------------------------------------------------------------- + def _on_start(self) -> None: + self._start_btn.setEnabled(False) + reccontrol.start_background(interval=self._interval.value()) + QTimer.singleShot(600, self._refresh_status) + + def _on_stop(self) -> None: + self._stop_btn.setEnabled(False) + reccontrol.stop_background() + QTimer.singleShot(600, self._refresh_status) + QTimer.singleShot(900, self._load_report) + + def _open_folder(self) -> None: + config.LOG_DIR.mkdir(parents=True, exist_ok=True) + QDesktopServices.openUrl(QUrl.fromLocalFile(str(config.LOG_DIR))) + + # --- refresh --------------------------------------------------------------- + def _refresh_status(self) -> None: + pid = reccontrol.running_pid() + status = reccontrol.read_status() + running = pid is not None + + if running: + self._state.setText(f"● Recording (pid {pid})") + self._state.setStyleSheet(f"color: {GOOD}; font-weight: 700; background: transparent;") + else: + self._state.setText("○ Not recording") + self._state.setStyleSheet(f"color: {MUTED}; font-weight: 700; background: transparent;") + self._start_btn.setEnabled(not running) + self._stop_btn.setEnabled(running) + self._interval.setEnabled(not running) + + if status: + self._info.setText( + f"Samples: {status.get('samples', 0)} " + f"Started: {_fmt_time(status.get('started'))} " + f"Updated: {_fmt_time(status.get('updated'), '%H:%M:%S')}\n" + f"Log: {status.get('log', config.LOG_FILE)}" + ) + latest = status.get("latest") + self._latest.setText(format_headline(latest) if latest else "") + if status.get("gpu_lost"): + self._warn.setText("⚠ A GPU-lost event was recorded this session") + self._warn.setVisible(True) + else: + self._warn.setVisible(False) + else: + 
self._info.setText("No recording yet. Press “Start recording”.") + self._latest.setText("") + self._warn.setVisible(False) + + def _load_report(self) -> None: + summary = summarize(config.LOG_FILE, last_n=10) + self._report.setPlainText(render_summary(summary, log_path=config.LOG_FILE)) diff --git a/src/rigdoctor/gui/theme.py b/src/rigdoctor/gui/theme.py index 7e76057..6279d7e 100644 --- a/src/rigdoctor/gui/theme.py +++ b/src/rigdoctor/gui/theme.py @@ -88,4 +88,23 @@ QScrollBar::handle:vertical {{ background: {CARD_BORDER}; border-radius: 5px; mi QScrollBar::handle:vertical:hover {{ background: #3a414d; }} QScrollBar::add-line:vertical, QScrollBar::sub-line:vertical {{ height: 0; }} QScrollBar::add-page:vertical, QScrollBar::sub-page:vertical {{ background: transparent; }} + +QPushButton {{ + background: #262b34; color: {TEXT}; border: 1px solid {CARD_BORDER}; + border-radius: 8px; padding: 7px 14px; +}} +QPushButton:hover {{ background: #2f3540; }} +QPushButton:disabled {{ color: #5b626c; background: #1c2026; border-color: #23272f; }} +QPushButton#PrimaryButton {{ background: {ACCENT}; color: #06222e; border: none; font-weight: 700; }} +QPushButton#PrimaryButton:hover {{ background: #5cc8fb; }} +QPushButton#PrimaryButton:disabled {{ background: #27424f; color: #5f7c8a; }} + +QDoubleSpinBox, QSpinBox {{ + background: #262b34; color: {TEXT}; border: 1px solid {CARD_BORDER}; + border-radius: 6px; padding: 4px 6px; +}} + +QTextEdit#Report {{ + background: #0d0f13; color: #cfd3da; border: 1px solid {CARD_BORDER}; border-radius: 8px; +}} """ diff --git a/src/rigdoctor/render.py b/src/rigdoctor/render.py index 72c3fa1..c37776c 100644 --- a/src/rigdoctor/render.py +++ b/src/rigdoctor/render.py @@ -2,21 +2,29 @@ from __future__ import annotations +import time + +from .core.crashlog import Summary, headline from .core.sample import Reading, Sample _GROUP_ORDER = ["gpu", "cpu", "memory", "storage"] _GROUP_TITLES = {"gpu": "GPU", "cpu": "CPU", "memory": "Memory", "storage": 
"Storage"} +def format_raw(value: float | None, unit: str) -> str: + """Format a value + unit for display.""" + if value is None: + return "N/A" + if unit == "°C": + return f"{value:.1f} °C" + if unit: + return f"{value:g} {unit}" + return f"{value:g}" + + def format_value(r: Reading) -> str: """Format a reading's value + unit for display (shared by CLI and GUI).""" - if r.value is None: - return "N/A" - if r.unit == "°C": - return f"{r.value:.1f} °C" - if r.unit: - return f"{r.value:g} {r.unit}" - return f"{r.value:g}" + return format_raw(r.value, r.unit) def metric_label(r: Reading) -> str: @@ -41,3 +49,91 @@ def render_snapshot(sample: Sample) -> str: lines = [title] + [_fmt(r) for r in groups[key]] blocks.append("\n".join(lines)) return "\n\n".join(blocks) + + +def format_headline(h: dict) -> str: + """One-line headline summary from a headline() dict.""" + + def g(value, unit): + return format_raw(value, unit) if value is not None else "—" + + return ( + f"GPU {g(h.get('gpu_temp'), '°C')} {g(h.get('gpu_util'), '%')} {g(h.get('gpu_power'), 'W')}" + f" · CPU {g(h.get('cpu_temp'), '°C')} · MEM {g(h.get('mem_pct'), '%')}" + ) + + +def _fmt_duration(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h}h {m}m {s}s" + if m: + return f"{m}m {s}s" + return f"{s}s" + + +# Metrics worth surfacing as session peaks (by metric name within reading.key). +_PEAK_METRICS = ("temp", "power", "util", "mem_util", "fan", "used_pct") +_SOURCE_ORDER = {"gpu": 0, "cpu": 1, "memory": 2, "storage": 3} + + +def _aggregate_peaks(maxima: dict) -> list[tuple[str, str, float, str, float, str]]: + """Collapse per-label maxima to the single worst value per (source, metric). + + Returns rows of (source, metric, value, unit, ts, label) in display order. 
+ """ + agg: dict[tuple[str, str], tuple[float, str, float, str]] = {} + for key, (value, unit, ts) in maxima.items(): + parts = key.split(".") + if len(parts) < 2 or parts[1] not in _PEAK_METRICS: + continue + source, metric = parts[0], parts[1] + label = ".".join(parts[2:]) + current = agg.get((source, metric)) + if current is None or value > current[0]: + agg[(source, metric)] = (value, unit, ts, label) + rows = [(s, m, v, u, ts, lbl) for (s, m), (v, u, ts, lbl) in agg.items()] + rows.sort(key=lambda r: (_SOURCE_ORDER.get(r[0], 9), r[1])) + return rows + + +def render_summary(summary: Summary, log_path=None) -> str: + if summary.samples == 0 and not summary.events: + where = f" ({log_path})" if log_path else "" + return f"No capture data found{where}. Start one with: rigdoctor record start" + + lines: list[str] = ["Crash-capture report", ""] + if summary.start and summary.end: + start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(summary.start)) + end = time.strftime("%H:%M:%S", time.localtime(summary.end)) + lines.append(f" Window : {start} → {end} ({_fmt_duration(summary.end - summary.start)})") + lines.append(f" Samples : {summary.samples}") + if log_path: + lines.append(f" Log : {log_path}") + + if summary.events: + lines += ["", "Events"] + for ts, kind, detail in summary.events: + stamp = time.strftime("%H:%M:%S", time.localtime(ts)) if ts else "--:--:--" + mark = " ⚠" if "lost" in kind else " " + suffix = f" — {detail}" if detail else "" + lines.append(f" {mark} {stamp} {kind}{suffix}") + + peaks = _aggregate_peaks(summary.maxima) + if peaks: + lines += ["", "Peaks (session maximum)"] + for source, metric, value, unit, ts, label in peaks: + stamp = time.strftime("%H:%M:%S", time.localtime(ts)) if ts else "" + detail = f" ({label})" if label else "" + name = f"{source} {metric}" + lines.append(f" {name:<16} {format_raw(value, unit):>10} at {stamp}{detail}") + + if summary.last: + lines += ["", f"Last {len(summary.last)} samples (most recent last)"] + 
for sample in summary.last: + stamp = time.strftime("%H:%M:%S", time.localtime(sample.ts)) if sample.ts else "--:--:--" + lines.append(f" {stamp} {format_headline(headline(sample))}") + + return "\n".join(lines) diff --git a/tests/test_crashlog.py b/tests/test_crashlog.py new file mode 100644 index 0000000..8608d37 --- /dev/null +++ b/tests/test_crashlog.py @@ -0,0 +1,103 @@ +"""Tests for the M3 crash-capture log: writer, rotation, reader, summary, recorder.""" + +import tempfile +import threading +import time +import unittest +from pathlib import Path + +from rigdoctor.core.crashlog import CrashLogWriter, iter_records, summarize +from rigdoctor.core.recorder import Recorder +from rigdoctor.core.sample import Reading, Sample +from rigdoctor.core.sampler import Sampler +from rigdoctor.core.sources.base import Source + + +class _FakeSource(Source): + name = "gpu" + + def __init__(self, temp=50.0): + self._temp = temp + + def probe(self): + return True + + def read(self): + return [ + Reading("gpu", "name", None, "", "Fake GPU"), + Reading("gpu", "temp", self._temp, "°C"), + Reading("gpu", "power", 100.0, "W"), + ] + + +class CrashLogTests(unittest.TestCase): + def test_write_and_read_roundtrip(self): + with tempfile.TemporaryDirectory() as d: + path = Path(d) / "capture.jsonl" + w = CrashLogWriter(path) + w.write_event("session-start") + w.write_sample(Sample(ts=1.0, readings=[Reading("gpu", "temp", 60.0, "°C")])) + w.write_event("gpu-lost", "timeout") + w.close() + + records = list(iter_records(path)) + self.assertEqual(records[0]["event"], "session-start") + self.assertEqual(records[1]["readings"][0], ["gpu", "temp", 60.0, "°C", ""]) + self.assertEqual(records[2]["event"], "gpu-lost") + + def test_rotation_bounds_segments(self): + with tempfile.TemporaryDirectory() as d: + path = Path(d) / "capture.jsonl" + w = CrashLogWriter(path, max_bytes=200, backups=2) + for i in range(200): + w.write_sample(Sample(ts=float(i), readings=[Reading("gpu", "temp", float(i), 
"°C")])) + w.close() + # base + at most `backups` rotated segments + segments = list(Path(d).glob("capture.jsonl*")) + self.assertLessEqual(len(segments), 3) + self.assertTrue((Path(d) / "capture.jsonl").exists()) + # rotation must not lose readability across segments + samples = [r for r in iter_records(path) if "readings" in r] + self.assertGreater(len(samples), 0) + + def test_summary_tracks_peaks_and_events(self): + with tempfile.TemporaryDirectory() as d: + path = Path(d) / "capture.jsonl" + w = CrashLogWriter(path) + w.write_sample(Sample(ts=1.0, readings=[Reading("gpu", "temp", 60.0, "°C")])) + w.write_sample(Sample(ts=2.0, readings=[Reading("gpu", "temp", 81.0, "°C")])) + w.write_event("gpu-lost", "timeout") + w.close() + + s = summarize(path) + self.assertEqual(s.samples, 2) + self.assertEqual(s.maxima["gpu.temp"][0], 81.0) + self.assertEqual(s.events[0][1], "gpu-lost") + self.assertEqual(len(s.last), 2) + + def test_recorder_writes_samples_and_stops(self): + with tempfile.TemporaryDirectory() as d: + path = Path(d) / "capture.jsonl" + status = Path(d) / "status.json" + rec = Recorder( + interval=0.02, + log_path=path, + status_path=status, + sampler=Sampler([_FakeSource()]), + ) + t = threading.Thread(target=rec.run) + t.start() + time.sleep(0.2) + rec.stop() + t.join(timeout=2) + + self.assertFalse(t.is_alive()) + self.assertGreater(rec.samples, 0) + self.assertTrue(status.exists()) + kinds = [r.get("event") for r in iter_records(path) if "event" in r] + self.assertIn("session-start", kinds) + self.assertIn("session-stop", kinds) + + +if __name__ == "__main__": + unittest.main()