From 305b6c4497140d566adf6b24588970cc1dc5c63d Mon Sep 17 00:00:00 2001 From: Jessey van Offeren Date: Thu, 21 May 2026 16:40:21 +0200 Subject: [PATCH] Initial commit: docs, decisions, and M1 sensor core Planning docs (SPEC, ARCHITECTURE, MODULES, ROADMAP, DECISIONS) with decisions D1-D15 settled: RigDoctor name, Python 3 + Qt/PySide6 stack (core/CLI/daemon stdlib-only), Ubuntu + NVIDIA first, .deb packaging, read-only + suggestions, GUI + tray modules, stress module dropped. First code: the M1 sensor core (stdlib-only) and a CLI. - core engine: Reading/Sample model, Sampler, hwmon reader - self-probing sources (NVIDIA first): nvidia-smi GPU, coretemp/k10temp CPU, /proc/meminfo + DDR5 SPD memory, NVMe storage - CLI: snapshot (text/JSON), monitor, sources; record/report stubbed - stdlib unittest smoke tests Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 17 +++ README.md | 83 +++++++++++++ docs/ARCHITECTURE.md | 166 +++++++++++++++++++++++++ docs/DECISIONS.md | 127 +++++++++++++++++++ docs/MODULES.md | 56 +++++++++ docs/ROADMAP.md | 56 +++++++++ docs/SPEC.md | 156 +++++++++++++++++++++++ installer/.gitkeep | 0 pyproject.toml | 22 ++++ src/rigdoctor/__init__.py | 3 + src/rigdoctor/__main__.py | 7 ++ src/rigdoctor/cli.py | 93 ++++++++++++++ src/rigdoctor/config.py | 38 ++++++ src/rigdoctor/core/__init__.py | 1 + src/rigdoctor/core/hwmon.py | 41 ++++++ src/rigdoctor/core/sample.py | 45 +++++++ src/rigdoctor/core/sampler.py | 37 ++++++ src/rigdoctor/core/sources/__init__.py | 31 +++++ src/rigdoctor/core/sources/base.py | 24 ++++ src/rigdoctor/core/sources/cpu.py | 32 +++++ src/rigdoctor/core/sources/memory.py | 48 +++++++ src/rigdoctor/core/sources/nvidia.py | 93 ++++++++++++++ src/rigdoctor/core/sources/storage.py | 34 +++++ src/rigdoctor/render.py | 38 ++++++ tests/test_core.py | 38 ++++++ 25 files changed, 1286 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 docs/ARCHITECTURE.md create mode 100644 
docs/DECISIONS.md create mode 100644 docs/MODULES.md create mode 100644 docs/ROADMAP.md create mode 100644 docs/SPEC.md create mode 100644 installer/.gitkeep create mode 100644 pyproject.toml create mode 100644 src/rigdoctor/__init__.py create mode 100644 src/rigdoctor/__main__.py create mode 100644 src/rigdoctor/cli.py create mode 100644 src/rigdoctor/config.py create mode 100644 src/rigdoctor/core/__init__.py create mode 100644 src/rigdoctor/core/hwmon.py create mode 100644 src/rigdoctor/core/sample.py create mode 100644 src/rigdoctor/core/sampler.py create mode 100644 src/rigdoctor/core/sources/__init__.py create mode 100644 src/rigdoctor/core/sources/base.py create mode 100644 src/rigdoctor/core/sources/cpu.py create mode 100644 src/rigdoctor/core/sources/memory.py create mode 100644 src/rigdoctor/core/sources/nvidia.py create mode 100644 src/rigdoctor/core/sources/storage.py create mode 100644 src/rigdoctor/render.py create mode 100644 tests/test_core.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0b5fef3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +# Python +__pycache__/ +*.py[cod] +.venv/ +venv/ + +# RigDoctor runtime output +logs/ +*.csv +*.jsonl +src/**/__pycache__/ + +# Editor / OS +.vscode/ +.idea/ +*.swp +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..85b93aa --- /dev/null +++ b/README.md @@ -0,0 +1,83 @@ +# RigDoctor + +A **modular diagnostics, monitoring, and health-check toolkit for Linux gamers.** + +> **Status:** ๐ŸŸข Phase 1 (MVP) in progress. Foundational decisions are settled and the +> **sensor core (M1)** works โ€” `snapshot` / `monitor` read NVIDIA GPU, CPU, memory, and +> NVMe live. Crash logger (M3) and health report (M4) are next. See `docs/ROADMAP.md`. 
+ +## Why this exists + +Linux gaming hardware faults are hard to diagnose: GPUs falling off the PCIe bus, the screen +suddenly going black mid-game, silent thermal/VRAM throttling, power transients, +driver/library mismatches, Proton quirks, and CPU governor / power-profile misconfiguration. +The data needed to diagnose them is scattered across `nvidia-smi`, `/sys/class/hwmon`, +`journalctl`, SMART, and more โ€” and the most useful readings (the ones right before a hard +freeze) are usually lost because nothing flushed them to disk. + +RigDoctor pulls all of that into one modular tool: live monitoring, crash-safe logging, a +one-shot health report, and an interactive installer that only sets up the modules a given +user actually needs for their hardware. + +**Seed use cases:** an RTX 3070 that intermittently "falls off the bus" under heavy GPU load +(Path of Exile on Linux, Escape from Tarkov on Windows), and a monitor going black mid-game. +See `docs/SPEC.md` ยง1. + +## How you run it + +Three front-ends over one shared engine โ€” pick what fits: +- **CLI / headless** โ€” full functionality from the terminal, works over SSH. +- **Desktop GUI** โ€” graphical dashboard, log browser, and health-report viewer. +- **Tray applet** โ€” a small applet in the top menu bar with quick actions (e.g. start + recording) and at-a-glance status. + +The GUI and tray are optional modules; a headless install loses no diagnostic capability. 
+ +## Key decisions (settled) + +| Topic | Decision | +|-------|----------| +| Name | **RigDoctor** | +| Language / stack | **Python 3 + Qt (PySide6)** โ€” core/CLI/daemon stdlib-only; Qt only for GUI/tray | +| Primary distro | **Ubuntu** (Debian via apt); others best-effort later | +| Primary GPU | **NVIDIA** first; AMD, then Intel later | +| MVP | **Sensor core + crash logger + health report** (NVIDIA-only, CLI-first) | +| Distribution | **`.deb`** + interactive module installer | +| Scope of action | **Read-only + suggestions** (no auto-apply yet) | +| Stress tests | **Out of scope** | + +Full rationale and the still-open questions are in `docs/DECISIONS.md`. + +## Repo layout + +| Path | Purpose | +|------|---------| +| `docs/SPEC.md` | Product specification โ€” vision, requirements, modules (the main planning doc) | +| `docs/ARCHITECTURE.md` | Technical design โ€” core engine, front-ends, daemon, installer | +| `docs/MODULES.md` | Catalog of modules with scope, dependencies, status | +| `docs/ROADMAP.md` | Phased milestones | +| `docs/DECISIONS.md` | Decision log + remaining open questions | +| `src/rigdoctor/` | Source code โ€” `core/` engine + sources, `cli.py`, `render.py` | +| `installer/` | Installer / `.deb` packaging (empty until Phase 4) | +| `tests/` | Tests (stdlib `unittest`) | + +## Run it (dev) + +Stdlib-only, no install needed (target is Python โ‰ฅ 3.11; tested on 3.14): + +```bash +PYTHONPATH=src python3 -m rigdoctor snapshot # one-shot sensor read +PYTHONPATH=src python3 -m rigdoctor snapshot --json +PYTHONPATH=src python3 -m rigdoctor monitor -n 1 # live view (Ctrl-C to quit) +PYTHONPATH=src python3 -m rigdoctor sources # list detected sensor sources +PYTHONPATH=src python3 -m unittest discover -s tests +``` + +Or `pip install -e .` to get a `rigdoctor` command on your PATH. + +## Start here + +1. Read `docs/SPEC.md` for what we're building. +2. Read `docs/ROADMAP.md` for the build order (Phase 1 = the MVP). +3. 
Read `docs/DECISIONS.md` for the settled decisions (D1โ€“D15). + diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..f453814 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,166 @@ +# RigDoctor โ€” Architecture (DRAFT v0.2) + +> Tech stack and key structural decisions are now settled (see `DECISIONS.md` D2, D6, D8, +> D10, D11). Items still marked **[OPEN]** are tracked there. + +## 1. Principles +- **Modular core + plugins.** A small engine; every capability is a module that can be + installed/omitted independently. +- **Capability detection over assumption.** Probe what hardware/tools exist; degrade + gracefully. +- **Vendor & distro abstraction.** GPU and package-manager differences live behind + interfaces, not scattered through the code (NVIDIA + apt are the first concrete impls). +- **One engine, many front-ends.** CLI, TUI, GUI, and tray are all thin front-ends over the + same core engine. Anything the GUI/tray can do is reachable headless from the CLI. + +## 2. Tech stack โ€” *DECIDED (D2)* +- **Language:** Python 3 (target machine has Python 3.14). +- **Core / CLI / daemon:** **stdlib only** โ€” no `pip` deps. Easy log/JSON/subprocess + handling, tiny footprint, runs headless/over SSH. +- **TUI (M2):** stdlib `curses` / plain ANSI redraw (no deps). +- **GUI (M10) + tray (M11):** **Qt via PySide6** โ€” one toolkit for both the desktop window + and the `QSystemTrayIcon` menu-bar applet. PySide6 is a dependency of *only* these two + modules, declared in the `.deb`; the core/daemon never import Qt. +- **Installer bootstrap (M9):** the `.deb`'s maintainer scripts ensure Python is present, + then hand off to the Python installer for module selection. + +## 3. 
Component layout +``` + +--------------------------+ + | core engine | (stdlib only) + | sources โ†’ sampler โ†’ bus | + +--------------------------+ + ^ ^ ^ ^ + +-------------------+ | | +--------------------+ + | +-----------+ +-----------+ | + +---------+ +----------+ +-----------+ +--------------+ + | CLI | | daemon | | GUI | | tray applet | + | (stdlib)| | (M3, | | (M10,Qt) | | (M11, Qt) | + | TUI(M2) | | systemd) | | | | | + +---------+ +----------+ +-----------+ +--------------+ +``` +- The **core engine** is a stdlib-only library: sources โ†’ sampler loop โ†’ an internal bus + that fans samples out to sinks (TUI renderer, CSV/JSON logger, alert engine, report + builder). +- The **daemon** (M3) is a long-running, stdlib-only process managed by `systemd --user`. +- The **GUI** and **tray** import PySide6 and talk to the same engine; for live status they + can read the daemon's output / a small status file or socket rather than re-sampling. + +## 4. Core engine +``` ++-------------------+ +------------------+ +-------------------+ +| Sources (probe) | ---> | Sampler loop | ---> | Sinks | +| nvidia-smi/NVML | | (interval, Hz) | | - TUI renderer | +| amdgpu sysfs | | normalizes into | | - CSV/JSON logger | +| hwmon/lm-sensors | | Sample records | | - Alert engine | +| journalctl/SMART | | | | - Report builder | ++-------------------+ +------------------+ | - GUI/tray feed | + +-------------------+ +``` +- **Sample record:** `{ ts, source, metric, value, unit }` flattened per tick into a row. +- **Sources** are pluggable; each declares which metrics it can provide and self-checks + availability at startup. NVIDIA (`nvidia-smi`/NVML) + hwmon are the first implementations. + +## 5. 
Module contract +Each module declares a manifest so the installer and engine can reason about it: +``` +module: + id: crash-logger + name: "Crash-capture logger" + provides: [logging] + requires_sources: [gpu, cpu_temp] # capabilities, not packages + system_packages: # per package manager, optional + apt: [] # uses nvidia-smi + sysfs only + pacman: [] + dnf: [] + python_deps: [] # e.g. GUI/tray modules โ†’ [pyside6] + optional_packages: + apt: [smartmontools] # enriches if present + gpu_vendors: [nvidia, amd, intel] + default_in_bundles: [essential] +``` +Lifecycle hooks a module may implement: `probe()`, `collect(sample)`, `render(view)`, +`report()`, `install_hint()`. GUI/tray modules additionally declare `python_deps: [pyside6]`. + +## 6. Crash-logger daemon & trigger model โ€” *DECIDED (D6)* +The logger (M3) runs as a `systemd --user` service. Three user-selectable trigger modes: +1. **Always-on** โ€” service enabled at login, samples continuously (bounded by rotation). +2. **Game-launch-triggered** โ€” starts when a game/Steam session begins, stops after. + Detection is layered (D12), no root: a precise **wrapper** (`rigdoctor wrap %command%` + + global Steam compat-tool) as primary; a zero-config **watcher** (Steam `RunningAppID` + + `/proc` heuristic) as fallback; **GameMode** D-Bus signals if `gamemoded` is present. +3. **Manual** โ€” started/stopped via the CLI (`rigdoctor record start/stop`) or the tray + applet's quick action. + +The selected mode is written to config by the installer and changeable later via CLI/GUI. + +## 7. GUI & tray โ€” *DECIDED (D10/D11)* +- **GUI (M10):** a PySide6 desktop app โ€” live dashboard (graphs/gauges), crash-log browser, + health-report viewer, inventory view, logger controls. Works under X11 and Wayland. +- **Tray (M11):** `QSystemTrayIcon` applet in the top menu bar (StatusNotifierItem; on + Ubuntu/GNOME surfaced via the AppIndicator extension). 
Dropdown shows live M1 readouts + (CPU temp, GPU temp, memory used/total, status dot) and actions led by **Run Diagnostic** + (the guided diagnostic session, ยง7.1), plus Open dashboard / Start-Stop recording / + Snapshot / Quit (D13). +- Both are **optional** โ€” a headless/server install omits them and loses no diagnostic + capability (everything is in the CLI). + +### 7.1 Guided diagnostic session (orchestration) +The "Run Diagnostic" flow (exposed in tray, GUI, and CLI) is not a new module โ€” it +orchestrates existing ones: **pick a game** (D12 detection: Steam library / recently played +/ running process) โ†’ **focused capture** (M3 scoped to that game's session via the D12 +wrapper/watcher) โ†’ **scan & analyze** (M4 over the captured window + system logs) โ†’ +**present prioritized findings** with suggested fixes (read-only, D9). The engine exposes it +as a single callable so all three front-ends share one implementation. + +## 8. Installer design (M9) +1. **Detect** GPU vendor via `lspci` (NVIDIA first) and the package manager (apt first). +2. **Present** a module menu grouped into bundles: + - *Essential* (sensor core + crash logger + health report) โ€” the MVP, NVIDIA-only. + - *Monitoring* (live TUI + alerts) + - *Diagnostics* (inventory + gaming-env checks + SMART) + - *Desktop UI* (GUI + tray applet โ€” adds the PySide6 dependency) + - *Custom* (pick individual modules) + For each selection, show the exact packages that will be installed. +3. **Resolve** dependencies: union of selected modules' `system_packages` + `python_deps` + for the detected package manager; report-only if a package is missing and sudo + unavailable. +4. **Install** (with explicit confirmation), **write config** (`~/.config/rigdoctor/`), + optionally **enable** the `systemd --user` logger service and choose its trigger mode (D6). +5. **Verify** each installed module's `probe()` and print a readiness summary. + +Module list/bundling is final (D14). 
Packaging is `.deb`-first (D8); the wizard layers +module selection on top of the package. + +## 9. GPU vendor abstraction +| Capability | NVIDIA (first) | AMD (later) | Intel (later) | +|------------|--------|-----|-------| +| Temps/clocks/power | `nvidia-smi`/NVML | `/sys/class/drm/.../device` + `rocm-smi` | `/sys` + `intel_gpu_top` | +| VRAM temp | mem-junction (often N/A on GeForce) | sysfs `mem` hwmon | n/a | +| Crash signature | Xid in dmesg | `amdgpu: GPU reset` / ring timeouts | i915 GPU hang | +| Power limit (read-only, D9) | `nvidia-smi -pl` (suggested, not applied) | sysfs `power_dpm` / `pp_*` | n/a | + +## 10. Data & config layout +``` +~/.config/rigdoctor/config.toml # enabled modules, thresholds, interval, trigger mode +~/.local/share/rigdoctor/logs/ # rotated crash logs (CSV/JSON) +~/.local/state/rigdoctor/ # session/min-max state, daemon status feed +``` + +## 11. Dependency package names โ€” apt-only (D15) +We maintain package names for **Ubuntu/apt only**; no cross-distro mapping is built or +maintained. The set is small (filled in per module as they land): + +| Logical dep | apt package | +|-------------|-------------| +| SMART | `smartmontools` | +| lm-sensors | `lm-sensors` | +| DMI/inventory | `dmidecode` | +| GUI/tray (Qt) | `python3-pyside6` | +| Tray on GNOME | `gir1.2-appindicator3-0.1` (AppIndicator) | +| Desktop notifications | `libnotify-bin` | + +Module manifests still declare deps under a `system_packages.apt` / `python_deps` key, so a +thin seam remains if another package manager is ever added โ€” but multi-distro support is **not +a planned deliverable** (D15). + diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md new file mode 100644 index 0000000..583deeb --- /dev/null +++ b/docs/DECISIONS.md @@ -0,0 +1,127 @@ +# RigDoctor โ€” Decisions & Open Questions + +Format: each item is **OPEN** (needs a call) or **DECIDED** (with date + rationale). 
+Decisions D1–D15 were all settled on 2026-05-21; the original open questions are kept below +with their resolutions so the reasoning is traceable. No tracked decisions are currently open. + +## Decided + +### D1 — Project name — *DECIDED 2026-05-21* +**RigDoctor.** Confirmed as the final name (repo, package, and CLI command `rigdoctor`). +Alternatives (RigWatch, GameDoc, Penguin Pit Crew, LGD) dropped. + +### D2 — Language / runtime — *DECIDED 2026-05-21* +**Python 3 + Qt (PySide6).** +- *Why Python:* fastest AI-assisted development (largest codegen corpus) and a perfect fit + for the real workload — parsing `nvidia-smi`/sysfs/`journalctl`, CSV/JSON, subprocess. +- *Why Qt/PySide6:* one toolkit covers **both** the desktop GUI and the system-tray applet. +- *Layering that preserves "low overhead":* the **core engine, CLI, and crash-logger daemon + stay stdlib-only** (no hard deps, tiny footprint); **only the GUI and tray modules pull in + PySide6**. This maps cleanly onto the modular installer — a headless/server user never + installs Qt. +- *Trade-off accepted:* the GUI carries a Qt runtime dependency (not a single static binary). + Mitigated by shipping a `.deb` that declares `python3` + `python3-pyside6` (see D8). + +### D3 — Distro priority order — *DECIDED 2026-05-21* +**Ubuntu first**, by an explicit margin. Debian comes along for free via `apt`. Arch +(`pacman`) / Fedora (`dnf`) / openSUSE (`zypper`) are best-effort later. The package-manager +and distro abstraction stays in the design so other distros can be added, but all primary +development, testing, and packaging target Ubuntu. + +### D4 — GPU vendor priority — *DECIDED 2026-05-21* +**NVIDIA first.** It's the seed hardware (RTX 3070) and the source of the motivating crash. +AMD and Intel come later behind the vendor abstraction; nothing should hard-code NVIDIA in a +way that blocks them. 
+ +### D5 — MVP scope — *DECIDED 2026-05-21* +**M1 + M3 + M4 (the *Essential* bundle), NVIDIA-only**, CLI-first. This is the first build +target — it captures the seed crash and explains the logs before any installer, GUI, tray, +or multi-vendor work. + +### D6 — Crash-logger trigger model — *DECIDED 2026-05-21* +**Let the user choose.** All three modes are supported and selectable (installer + config): +1. **Always-on** `systemd --user` service. +2. **Game-launch-triggered** (auto-start when a game/Steam session starts, stop after). +3. **Manual** (CLI command, or the tray applet's "start recording" button). +*Resolved later:* the exact game-launch detection mechanism — see D12. + +### D7 — Stress / repro module — *DECIDED 2026-05-21* +**Out of scope. Module M7 is dropped.** RigDoctor will not build or bundle stress/load +generators. Users who want to reproduce load can run existing tools (gpu-burn, vkmark, +stress-ng) themselves alongside the logger. + +### D8 — Distribution / packaging — *DECIDED 2026-05-21* +**`.deb` package** as the primary distribution channel (matches the Ubuntu-first focus). The +`.deb` declares dependencies per module group; the interactive installer (M9) handles module +selection on top. AUR / Flatpak / COPR are possible later, not now. + +### D9 — Scope of action (read-only vs apply-fixes) — *DECIDED 2026-05-21* +**Read-only + suggestions.** RigDoctor diagnoses, monitors, and **suggests** actions in +plain language (with the exact command where possible), but does **not** apply changes +itself in this stage. Auto-applying fixes (governor, power profile, etc.) is a deliberate +later milestone, gated behind explicit user consent when it lands. 
+ +### D10 — GUI is a first-class deliverable — *DECIDED 2026-05-21* +The app must run **three ways**: (a) **CLI-only / headless** (full functionality from the +terminal, works over SSH), (b) a **desktop GUI**, and (c) a **system-tray / top-menu-bar +applet** with quick actions. This supersedes the original "terminal-first, GUI maybe later" +non-goal. GUI and tray are separate optional modules over the shared core engine. + +### D11 — Tray / menu-bar applet — *DECIDED 2026-05-21* +A small always-available applet in the Linux top menu bar (system tray / StatusNotifierItem, +via Qt's `QSystemTrayIcon`; on Ubuntu/GNOME this surfaces through the AppIndicator +extension). Provides quick actions and at-a-glance status. +*Resolved later:* the exact set of quick actions/indicators — see D13. + +### D12 — Game-launch detection mechanism — *DECIDED 2026-05-21* +**Layered approach, no root** (logger stays a `systemd --user` service): +1. **Wrapper (precise, primary):** `rigdoctor wrap %command%` for per-game Steam launch + options, plus an installer helper that registers RigDoctor as a **global Steam + compatibility tool** (covers all Proton games without per-game edits). The same wrapper + field works in Lutris/Heroic. Deterministic start/stop, knows the title, needs no + watcher daemon. *Build first.* +2. **Zero-config watcher (fallback):** low-frequency poll of Steam's `RunningAppID` + (`~/.steam/registry.vdf`) plus a `/proc` heuristic for non-Steam launchers, for users + who won't edit launch options. *Build later.* +3. **GameMode (opportunistic):** if Feral `gamemoded` is present, use its D-Bus + `GameRegistered`/`GameUnregistered` signals (via `gdbus`/`busctl` — no Python dbus dep). +- *Explicitly rejected:* root-only kernel mechanisms (proc-connector netlink `PROC_EVENTS`, + eBPF) — they'd force the logger to run as root. +- *Phasing:* wrapper ships with the game-launch trigger mode (Phase 4); watcher + GameMode + follow. 
+ +### D13 — Tray / menu-bar applet: actions & indicators — *DECIDED 2026-05-21* +**Live readouts (from M1) + a Run Diagnostic action.** +- **At-a-glance live data** shown inline in the tray dropdown, refreshed periodically: + **CPU temp, GPU temp, memory used/total** (e.g. "14 GB / 32 GB"). A status dot + (normal / throttling / alert) is proposed alongside. +- **Run Diagnostic** — the primary action. Launches the **guided diagnostic session** + (SPEC §4): prompts *which game to focus on*, starts a focused log collection for that + game's session (M3, scoped via the D12 game detection), then scans/analyzes (M4) and + presents the findings. +- **Supporting actions (proposed minimal set):** Open dashboard (M10), Start/Stop recording + (manual trigger), Snapshot now, Quit. + +### D14 — Final installer module list & bundles — *DECIDED 2026-05-21* +**Use the current `MODULES.md` catalog and bundles as final.** Modules: M1, M2, M3, M4, M5, +M6, M8, M9, M10, M11 (M7 dropped). Bundles: Essential / Monitoring / Diagnostics / +Desktop UI (+ Custom). No further additions planned for v1. + +### D15 — Distro package-name mapping → apt-only — *DECIDED 2026-05-21* +*What it was:* RigDoctor's optional modules need a few system packages (smartmontools, +lm-sensors, dmidecode, python3-pyside6, AppIndicator). The same tool is named differently +per distro (e.g. `lm-sensors` on apt vs `lm_sensors` on pacman/dnf; Qt is `python3-pyside6` +on apt). Supporting multiple distros would require a table mapping each logical dependency to +the right package name per package manager. +*Decision:* **apt-only.** We maintain package names for **Ubuntu/apt only** and do **not** +build or maintain mappings for other package managers. A thin seam is left in the design so +another package manager *could* be added later, but multi-distro support is **not** a planned +deliverable. Revisit only if Ubuntu-only proves too narrow. 
+ +## Open + +None currently โ€” all tracked decisions (D1โ€“D15) are resolved. New questions will be added +here as they arise. Remaining detail to flesh out during build: the tray's supporting-action +set (D13 proposed list) and per-module apt package names (filled in as modules land). + + diff --git a/docs/MODULES.md b/docs/MODULES.md new file mode 100644 index 0000000..430eede --- /dev/null +++ b/docs/MODULES.md @@ -0,0 +1,56 @@ +# RigDoctor โ€” Module Catalog (DRAFT v0.2) + +Status: โฌœ not started ยท ๐ŸŸฆ designing ยท ๐ŸŸจ in progress ยท โœ… done + +> Final module set (D14). **M7 (stress/repro) was dropped (D7).** M10/M11 are the GUI and +> tray modules (D10/D11). GPU scope reads "all (NVIDIA first)" โ€” NVIDIA is implemented first, +> others via the vendor abstraction (D4). + +| ID | Module | Bundle | Key deps | GPU scope | Priority | Status | +|----|--------|--------|----------|-----------|----------|--------| +| M1 | Sensor core | Essential | none (nvidia-smi, sysfs) | all (NVIDIA first) | P0 | โฌœ | +| M3 | Crash-capture logger | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | โฌœ | +| M4 | Health report (log scan) | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | โฌœ | +| M2 | Live monitor (TUI) | Monitoring | none (stdlib curses) | all | P1 | โฌœ | +| M8 | Alerting | Monitoring | libnotify (opt) | all | P2 | โฌœ | +| M5 | System inventory | Diagnostics | none (opt: lm-sensors, dmidecode) | all | P1 | โฌœ | +| M6 | Gaming env checks | Diagnostics | none | all | P2 | โฌœ | +| M10 | Desktop GUI | Desktop UI | **python3-pyside6** | all | P2 | โฌœ | +| M11 | Tray / menu-bar applet | Desktop UI | **python3-pyside6** (+ AppIndicator on GNOME) | all | P2 | โฌœ | +| M9 | Installer | (meta) | none | all | P1 | โฌœ | +| ~~M7~~ | ~~Stress / repro~~ | โ€” | โ€” | โ€” | โ€” | โŒ dropped (D7) | + +## Notes per module +- **M1 Sensor core** โ€” the foundation everything else samples from. Stdlib-only. 
Abstracts + NVIDIA/AMD/Intel + hwmon behind one interface; **ship the NVIDIA + hwmon path first**. +- **M3 Crash-capture logger** โ€” the highest-value piece for the seed use case. `fsync` per + sample; GPU-lost detection via query timeout; bounded rotation; `systemd --user` service + with a **user-selectable trigger mode** (always-on / game-launch / manual โ€” D6). +- **M4 Health report** โ€” turns scattered logs into a prioritized, plain-language findings + list with **suggested** fixes (read-only, D9). Reuses M1 for a live snapshot. Also powers + the **guided diagnostic session** (with M3): pick a game โ†’ focused capture โ†’ scan โ†’ + findings (see SPEC ยง4). +- **M2 Live monitor** โ€” depends on M1; the terminal "HWMonitor for Linux" face. Stdlib-only. +- **M5 / M6 Diagnostics** โ€” inventory export + gaming-env checks; M6 flags risky settings and + suggests the fix command but does not apply it (D9). +- **M8 Alerting** โ€” threshold/event notifications; integrates with the tray applet (M11). +- **M10 Desktop GUI** โ€” PySide6 graphical front-end over the core engine (dashboard, log + browser, report viewer, logger controls). Optional; adds the Qt dependency. +- **M11 Tray applet** โ€” `QSystemTrayIcon` menu-bar applet. Dropdown shows live M1 readouts + (CPU temp, GPU temp, memory used/total, status dot) and is led by a **Run Diagnostic** + action (the guided diagnostic session), plus Open dashboard / Start-Stop recording / + Snapshot / Quit (D13). Optional; shares the Qt dependency with M10. +- **M9 Installer** โ€” interactive wizard layered on the `.deb` (D8); apt-first dependency + resolution; enables the logger service and trigger mode. 
+ +## Bundles (final โ€” D14) +- **Essential:** M1 + M3 + M4 *(the MVP, NVIDIA-only โ€” D5)* +- **Monitoring:** M2 + M8 +- **Diagnostics:** M5 + M6 +- **Desktop UI:** M10 + M11 *(adds PySide6)* + +## MVP candidate โ€” *confirmed (D5)* +**M1 + M3 + M4 (Essential), NVIDIA-only, CLI-first.** Gives a working tool that captures the +GPU crash and explains the logs โ€” deliverable before the installer, GUI/tray, or multi-vendor +work. + diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md new file mode 100644 index 0000000..637599f --- /dev/null +++ b/docs/ROADMAP.md @@ -0,0 +1,56 @@ +# RigDoctor โ€” Roadmap (DRAFT v0.2) + +Phased so the seed use case (capturing the RTX 3070 crash / black-screen events) is solved +early, before the broader "tool for all Linux gamers" work. Stack: Python 3 + Qt/PySide6; +Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`). + +## Phase 0 โ€” Workspace & spec *(done)* +- [x] Create repo + docs scaffold +- [x] Settle the foundational decisions D1โ€“D11 (name, language, platform/GPU priority, MVP + scope, trigger model, packaging, scope-of-action, GUI/tray) +- [x] Lock the MVP scope (M1 + M3 + M4, NVIDIA-only) + +## Phase 1 โ€” MVP: capture *this* crash (Essential bundle, NVIDIA-only, CLI) +- [ ] M1 sensor core (NVIDIA via nvidia-smi + hwmon for CPU/RAM/NVMe), stdlib-only +- [ ] M3 crash-capture logger (CSV, fsync per sample, GPU-lost detection, rotation, + `systemd --user` service) +- [ ] Manual trigger mode first (`rigdoctor record start/stop`); other modes in Phase 4 +- [ ] M4 health report (Xid/panic/OOM/MCE/AER/thermal scan + driver-mismatch + snapshot, + suggested fixes only โ€” D9) +- [ ] `--report` post-crash summary (max temps/power, throttle events, last N samples) +- **Exit criteria:** user can run it during gaming and, after a freeze/black-screen, see the + last readings + a plausible cause. 
+ +## Phase 2 โ€” Live monitor (terminal) +- [ ] M2 TUI dashboard (current/min/max, grouped, throttle highlighting) +- [ ] M8 basic alerting (overheat/throttle/GPU-lost notifications) + +## Phase 3 โ€” Diagnostics breadth +- [ ] M5 system inventory + exportable report +- [ ] M6 gaming environment checks (suggest-only) +- [ ] SMART integration (smartmontools if present) + +## Phase 4 โ€” Desktop UI & installer +- [ ] M10 desktop GUI (PySide6: dashboard, log browser, report viewer, logger controls) +- [ ] M11 tray / menu-bar applet (QSystemTrayIcon: live M1 readouts + Run Diagnostic + + supporting actions โ€” D13) +- [ ] Guided diagnostic session (pick game โ†’ focused M3 capture โ†’ M4 scan โ†’ findings), + shared by tray/GUI/CLI +- [ ] Logger trigger modes: always-on + game-launch (D12 โ€” wrapper first: + `rigdoctor wrap %command%` + global Steam compat-tool; zero-config watcher + (Steam RunningAppID + /proc) and GameMode hook follow) +- [ ] M9 interactive installer (GPU detection, module menu, apt dependency resolution, + service enable + trigger-mode pick) +- [ ] `.deb` packaging (D8) declaring per-bundle deps incl. python3-pyside6 for Desktop UI + +## Phase 5 โ€” Breadth (later) +- [ ] AMD GPU support in M1 (Steam Deck / Radeon) +- [ ] Intel GPU best-effort +- [ ] (Later, separate milestone) Optional auto-apply of suggested fixes behind explicit + consent โ€” currently out of scope (D9) + +> **Out of scope:** stress/repro module (D7); multi-distro support and packaging beyond +> Ubuntu/apt + `.deb` (D15) โ€” a thin seam is kept but not built out. + +> **Dropped:** stress / repro module (D7) โ€” not on the roadmap. + diff --git a/docs/SPEC.md b/docs/SPEC.md new file mode 100644 index 0000000..ed9371f --- /dev/null +++ b/docs/SPEC.md @@ -0,0 +1,156 @@ +# RigDoctor โ€” Product Specification (DRAFT v0.2) + +> Living spec. 
The foundational decisions (name, language, platform/GPU priority, MVP scope, +> packaging, scope-of-action, GUI/tray) are now settled — see `DECISIONS.md` (D1–D15). +> Anything still marked **[OPEN]** is tracked there (none at present). + +## 1. Vision + +A single, modular toolkit that lets a Linux gamer **monitor**, **diagnose**, and +**understand the health** of their machine — especially the hard-to-catch faults that happen +under gaming load. The goal is to make otherwise near-impossible-to-investigate problems +(random freezes, the screen suddenly going black mid-game, GPU "lost" events) tractable by +capturing the right data automatically and explaining it in plain language. Users install +only the modules relevant to their hardware via an interactive installer. + +**Motivating cases:** +- An RTX 3070 intermittently falls off the PCIe bus under heavy GPU/VRAM load + (`Xid 79` / `Xid 154`, `NV_ERR_GPU_IS_LOST`). The crash is OS-independent (also seen on + Windows in Tarkov) and load-correlated, pointing at hardware (VRAM thermals / power + transients / PCIe signal integrity). +- A monitor going black mid-session (e.g. during Path of Exile) — is it the GPU dropping, + a driver reset, a cable/DP link issue, or a power event? Manually impossible to tell after + the fact. + +In both cases the last sensor readings before the freeze are normally never captured. +RigDoctor's crash-safe logger is designed to fix exactly that. + +## 2. Goals / Non-goals + +**Goals** +- Catch and preserve the machine's state in the seconds before a hard freeze. +- Make hard-to-investigate gaming faults debuggable: collect scattered signals, correlate + them, and explain them. +- Offer **three ways to run**: full **CLI / headless** (works over SSH), a **desktop GUI**, + and a **system-tray / top-menu-bar applet** with quick actions. 
(D10/D11) +- Be modular: a novice installs a one-click "monitor + capture + report" bundle; a power + user installs everything including the GUI, tray, and diagnostics. +- Low overhead; safe defaults; no telemetry/phone-home. + +**Non-goals (for now)** +- Not a benchmark-score / e-peen leaderboard tool. +- **Not a stress-test / load-generator** — explicitly out of scope (D7). Users can run + existing tools (gpu-burn, vkmark, stress-ng) alongside the logger if they want. +- Not an overclocking utility. +- **Not (yet) an auto-fixer.** RigDoctor is **read-only**: it diagnoses and *suggests* + actions (with the exact command where possible) but does not apply changes itself in this + stage. Auto-apply is a deliberate later milestone behind explicit consent. (D9) + +## 3. Target users & platforms + +- **Users:** Linux gamers from novice ("is my PC ok?" + alerts, via GUI/tray) to advanced + (raw logs, log forensics, headless capture over SSH). +- **Distros:** **Ubuntu first** (and Debian via `apt`). Arch (`pacman`) / Fedora (`dnf`) / + openSUSE (`zypper`) best-effort later, behind the distro abstraction. (D3) +- **GPUs:** **NVIDIA first** (seed hardware). AMD second, Intel third — behind the vendor + abstraction. (D4) +- **Display:** GUI and tray must work under both X11 and Wayland on Ubuntu/GNOME; **all core + functionality must also work fully headless** (CLI, over SSH, no display). +- **Runtime:** Python 3 + Qt (PySide6). Core/CLI/daemon are stdlib-only; GUI and tray add + PySide6. (D2) + +## 4. Functional requirements (by module) + +> Module IDs are stable. **M7 (stress/repro) is dropped** (D7). M10/M11 are the new GUI and +> tray modules. + +### M1 — Sensor core (foundation, always installed) +Unified sampling of: CPU temp/freq/load, per-core; GPU temp/(mem-junction if exposed)/ +clocks/power/util/fan/VRAM/PCIe gen+width/throttle reasons; RAM (DDR5 SPD) temps; NVMe/SSD +temps; system load. 
Pluggable sources: `nvidia-smi`/NVML (first), `amdgpu` sysfs/`rocm-smi` +(later), `/sys/class/hwmon`, `lm-sensors`. Stdlib-only. + +### M2 — Live monitor (TUI) +HWMonitor-style terminal dashboard: current / session-min / session-max per sensor, grouped +by subsystem, with throttle/critical highlighting. Refresh rate configurable. The terminal +face of the live data (the GUI in M10 presents the same data graphically). + +### M3 — Crash-capture logger (daemon) +Headless background sampler that writes CSV/JSON and **`fsync`s every sample** so the last +readings survive a hard lock. Detects GPU "lost"/hang (query timeout) and writes a marker. +Ring-buffer/rotation to bound disk use. Runs as a `systemd --user` service. **Trigger model +is user-selectable** (D6): always-on, game-launch-triggered, or manual (CLI / tray button). +Stdlib-only. + +### M4 — Health report (one-shot) +Scans `journalctl` for Xid, kernel panics, OOM-killer, MCE, PCIe AER, thermal events; checks +SMART disk health; flags driver/library version mismatches; verifies GPU firmware; prints a +prioritized findings list with plain-language explanations and **suggested** fixes (read-only +per D9). Reuses M1 for a live snapshot. + +### M5 — System inventory +CPU/GPU/motherboard/BIOS/RAM/storage, kernel, driver versions, X11/Wayland + compositor, +PCIe topology. Exportable (Markdown/JSON) to paste into forum/bug reports. + +### M6 — Gaming environment checks +Detects & evaluates: GPU power profile / persistence mode, CPU governor, Proton/Wine/Steam +versions, GameMode, MangoHud, shader cache, swappiness, hugepages, CPU mitigations, +PCIe ASPM. Flags settings that hurt stability/performance and **suggests** the fix command +(read-only per D9). + +### M8 — Alerting +Threshold + event alerts (desktop notification / sound / log) on overheat, throttle, +GPU-lost, SMART failure. Surfaces in the tray applet (M11) when installed. Optional. 
+ +### M10 — Desktop GUI (PySide6/Qt) +Full graphical front-end over the core engine: live dashboard (graphs/gauges), browse and +visualize captured crash logs, run a health report and view findings, view system inventory, +toggle the logger and its trigger mode. Mirrors CLI capability for non-terminal users. +Optional module (pulls in PySide6). + +### M11 — System-tray / menu-bar applet (PySide6/Qt) +A small always-available applet in the Linux top menu bar (system tray / +StatusNotifierItem; on Ubuntu/GNOME via the AppIndicator extension). Optional module. +Contents (D13): +- **At-a-glance live readouts (from M1)** in the dropdown, refreshed periodically: + **CPU temp, GPU temp, memory used/total** (e.g. "14 GB / 32 GB"); a status dot + (normal / throttling / alert) alongside. +- **Run Diagnostic** — the headline action; launches the *guided diagnostic session* below. +- **Supporting actions:** Open dashboard (M10), Start/Stop recording, Snapshot now, Quit. + +### Guided diagnostic session (M3 + M4 workflow) +The "Run Diagnostic" flow available from the tray (M11), the GUI (M10), and the CLI: +1. **Pick a game to focus on** — chosen from detected/installed games (via the D12 game + detection: Steam library / recently played / running process). +2. **Collect** — RigDoctor runs a focused crash-capture session (M3) scoped to that game: + it logs while you play, bracketing the session via the D12 wrapper/watcher. +3. **Scan & analyze** — when the session ends (or after a crash + reboot), it runs the + health report (M4) over the captured window + system logs to surface likely issues. +4. **Present findings** — a prioritized, plain-language list with suggested fixes + (read-only, D9). +This is the one-click expression of the seed use case; it orchestrates existing modules +rather than adding a new one. 
+ +### M9 — Installer (see ARCHITECTURE §5) +Interactive wizard: detect GPU vendor (NVIDIA-first) → present module menu grouped into +bundles with descriptions and the exact packages each needs → resolve & install (apt first) +→ write config → optionally enable the `systemd --user` logger service and pick its trigger +mode. Delivered alongside the `.deb` (D8). Module list/bundling is final per D14. + +## 5. Non-functional requirements +- **Zero hard deps for the core/CLI/daemon** — Python stdlib + tools already present. **Qt + (PySide6) is required only by the GUI (M10) and tray (M11) modules**, declared in the + `.deb` and pulled in only when those modules are selected. +- **Crash-safe logging** — flush + `fsync` per sample; bounded disk usage. +- **Low overhead** — default ≤1 Hz sampling; negligible CPU/GPU cost. The always-on daemon + is stdlib-only (no Qt loaded) so it stays tiny. +- **Headless-equivalent** — every diagnostic capability is reachable from the CLI; the GUI + and tray are conveniences over the same engine, never the only way to do something. +- **Privacy** — local only; inventory export is opt-in and reviewable; no telemetry. +- **Portability** — graceful degradation when a sensor/tool is unavailable (N/A, not crash). + +## 6. Open questions +None tracked — all foundational decisions (D1–D15) are settled; see `DECISIONS.md`. Detail +to flesh out during build: the tray's supporting-action set and per-module apt package names. +Packaging/deps are **Ubuntu/apt-only** (D15) — no multi-distro mapping is maintained. 
+ diff --git a/installer/.gitkeep b/installer/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8be7c81 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[project] +name = "rigdoctor" +version = "0.1.0" +description = "Modular hardware monitoring & crash diagnostics for Linux gamers." +readme = "README.md" +requires-python = ">=3.11" +# Core/CLI/daemon are stdlib-only by design (D2). The GUI/tray modules will add +# PySide6 via an optional extra when those land. +dependencies = [] + +[project.optional-dependencies] +gui = ["PySide6"] + +[project.scripts] +rigdoctor = "rigdoctor.cli:main" + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/src/rigdoctor/__init__.py b/src/rigdoctor/__init__.py new file mode 100644 index 0000000..460e6ed --- /dev/null +++ b/src/rigdoctor/__init__.py @@ -0,0 +1,3 @@ +"""RigDoctor โ€” modular hardware monitoring & crash diagnostics for Linux gamers.""" + +__version__ = "0.1.0" diff --git a/src/rigdoctor/__main__.py b/src/rigdoctor/__main__.py new file mode 100644 index 0000000..f8760ba --- /dev/null +++ b/src/rigdoctor/__main__.py @@ -0,0 +1,7 @@ +"""Allow `python -m rigdoctor`.""" + +import sys + +from .cli import main + +sys.exit(main()) diff --git a/src/rigdoctor/cli.py b/src/rigdoctor/cli.py new file mode 100644 index 0000000..03d13f6 --- /dev/null +++ b/src/rigdoctor/cli.py @@ -0,0 +1,93 @@ +"""RigDoctor command-line interface.""" + +from __future__ import annotations + +import argparse +import json +import sys + +from . 
import __version__ +from .config import load_config +from .core.sampler import Sampler +from .core.sources import available_sources +from .render import render_snapshot + + +def _sampler() -> Sampler: + return Sampler(available_sources()) + + +def cmd_sources(args) -> int: + srcs = available_sources() + if not srcs: + print("No sensor sources detected.") + return 1 + print("Detected sources:") + for s in srcs: + print(f" - {s.name} ({type(s).__name__})") + return 0 + + +def cmd_snapshot(args) -> int: + sample = _sampler().sample() + if args.json: + payload = {"ts": sample.ts, "readings": sample.to_rows()} + print(json.dumps(payload, indent=2, ensure_ascii=False)) + else: + print(render_snapshot(sample)) + return 0 + + +def cmd_monitor(args) -> int: + interval = args.interval or load_config()["interval"] + try: + for sample in _sampler().stream(interval=interval): + # Basic full-screen redraw; the rich TUI (M2) comes later. + print("\033[2J\033[H", end="") + print(f"RigDoctor โ€” live (every {interval:g}s, Ctrl-C to quit)\n") + print(render_snapshot(sample)) + sys.stdout.flush() + except KeyboardInterrupt: + print() + return 0 + + +def cmd_record(args) -> int: + print("`record` (M3 crash-capture logger) is not implemented yet โ€” next on the roadmap.") + return 2 + + +def cmd_report(args) -> int: + print("`report` (M4 health report) is not implemented yet โ€” next on the roadmap.") + return 2 + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="rigdoctor", + description="Hardware monitoring & crash diagnostics for Linux gamers.", + ) + p.add_argument("-V", "--version", action="version", version=f"rigdoctor {__version__}") + sub = p.add_subparsers(dest="command", required=True) + + sp = sub.add_parser("snapshot", help="print a one-shot reading of all sensors") + sp.add_argument("--json", action="store_true", help="output JSON instead of text") + sp.set_defaults(func=cmd_snapshot) + + mp = sub.add_parser("monitor", 
help="live-refreshing sensor view") + mp.add_argument("-n", "--interval", type=float, default=None, help="refresh interval (s)") + mp.set_defaults(func=cmd_monitor) + + sub.add_parser("sources", help="list detected sensor sources").set_defaults(func=cmd_sources) + sub.add_parser("record", help="crash-capture logger (coming soon)").set_defaults(func=cmd_record) + sub.add_parser("report", help="health report (coming soon)").set_defaults(func=cmd_report) + return p + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/rigdoctor/config.py b/src/rigdoctor/config.py new file mode 100644 index 0000000..23cc60d --- /dev/null +++ b/src/rigdoctor/config.py @@ -0,0 +1,38 @@ +"""Paths and configuration defaults (XDG layout, see ARCHITECTURE ยง10).""" + +from __future__ import annotations + +import os +from pathlib import Path + +APP = "rigdoctor" + + +def _xdg(env: str, default: str) -> Path: + base = os.environ.get(env) or str(Path.home() / default) + return Path(base) / APP + + +CONFIG_DIR = _xdg("XDG_CONFIG_HOME", ".config") +DATA_DIR = _xdg("XDG_DATA_HOME", ".local/share") +STATE_DIR = _xdg("XDG_STATE_HOME", ".local/state") +LOG_DIR = DATA_DIR / "logs" +CONFIG_FILE = CONFIG_DIR / "config.toml" + +DEFAULTS: dict = { + "interval": 1.0, # sampling interval in seconds (default โ‰ค1 Hz, low overhead โ€” NFR) +} + + +def load_config() -> dict: + """Return defaults merged with config.toml if present (best-effort).""" + cfg = dict(DEFAULTS) + try: + import tomllib + + if CONFIG_FILE.exists(): + with CONFIG_FILE.open("rb") as f: + cfg.update(tomllib.load(f)) + except Exception: + pass + return cfg diff --git a/src/rigdoctor/core/__init__.py b/src/rigdoctor/core/__init__.py new file mode 100644 index 0000000..305f757 --- /dev/null +++ b/src/rigdoctor/core/__init__.py @@ -0,0 +1 @@ +"""Core engine: sources โ†’ sampler โ†’ samples (stdlib-only).""" diff 
--git a/src/rigdoctor/core/hwmon.py b/src/rigdoctor/core/hwmon.py new file mode 100644 index 0000000..c72e163 --- /dev/null +++ b/src/rigdoctor/core/hwmon.py @@ -0,0 +1,41 @@ +"""Minimal /sys/class/hwmon reader (stdlib only).""" + +from __future__ import annotations + +from pathlib import Path + +HWMON_ROOT = Path("/sys/class/hwmon") + + +def _read(path: Path) -> str | None: + try: + return path.read_text().strip() + except OSError: + return None + + +def find_by_name(name: str) -> list[Path]: + """Return hwmon dirs whose `name` file equals `name` (e.g. 'coretemp').""" + matches: list[Path] = [] + if not HWMON_ROOT.exists(): + return matches + for d in sorted(HWMON_ROOT.glob("hwmon*")): + if _read(d / "name") == name: + matches.append(d) + return matches + + +def read_temps(hwmon_dir: Path) -> list[tuple[str, float]]: + """Return (label, ยฐC) for each tempN_input in a hwmon dir.""" + out: list[tuple[str, float]] = [] + for inp in sorted(hwmon_dir.glob("temp*_input")): + raw = _read(inp) + if raw is None: + continue + try: + celsius = int(raw) / 1000.0 + except ValueError: + continue + label = _read(inp.with_name(inp.name.replace("_input", "_label"))) + out.append((label or inp.name.replace("_input", ""), celsius)) + return out diff --git a/src/rigdoctor/core/sample.py b/src/rigdoctor/core/sample.py new file mode 100644 index 0000000..c390c9c --- /dev/null +++ b/src/rigdoctor/core/sample.py @@ -0,0 +1,45 @@ +"""Core data model: a Reading and a Sample (one tick across all sources).""" + +from __future__ import annotations + +import time +from dataclasses import asdict, dataclass, field + + +@dataclass(frozen=True) +class Reading: + """A single normalized sensor value. + + `value` is None when the metric is unavailable/N-A, so consumers can render + "N/A" rather than crash (graceful degradation โ€” NFR). + """ + + source: str # subsystem id: "gpu", "cpu", "memory", "storage" + metric: str # what it measures: "temp", "power", "util", ... 
+ value: float | None # None == unavailable + unit: str = "" # "ยฐC", "W", "%", "MHz", "GB", "MiB", ... + label: str = "" # optional detail: core name, DIMM, device, "junction" + + @property + def key(self) -> str: + suffix = f".{self.label}" if self.label else "" + return f"{self.source}.{self.metric}{suffix}" + + +@dataclass +class Sample: + """All readings captured in one sampling tick.""" + + ts: float = field(default_factory=time.time) + readings: list[Reading] = field(default_factory=list) + + def by_source(self) -> dict[str, list[Reading]]: + """Group readings by subsystem, preserving insertion order.""" + groups: dict[str, list[Reading]] = {} + for r in self.readings: + groups.setdefault(r.source, []).append(r) + return groups + + def to_rows(self) -> list[dict]: + """Flatten to plain dicts for CSV/JSON logging.""" + return [{"ts": self.ts, **asdict(r)} for r in self.readings] diff --git a/src/rigdoctor/core/sampler.py b/src/rigdoctor/core/sampler.py new file mode 100644 index 0000000..e5e43eb --- /dev/null +++ b/src/rigdoctor/core/sampler.py @@ -0,0 +1,37 @@ +"""Sampling loop: poll all sources into Samples.""" + +from __future__ import annotations + +import time +from collections.abc import Iterator + +from .sample import Sample +from .sources.base import Source + + +class Sampler: + """Polls a set of sources, producing one Sample per tick.""" + + def __init__(self, sources: list[Source]): + self.sources = sources + + def sample(self) -> Sample: + s = Sample() + for src in self.sources: + try: + s.readings.extend(src.read()) + except Exception: + # A single misbehaving source must not abort the whole tick. 
+ continue + return s + + def stream(self, interval: float = 1.0, count: int | None = None) -> Iterator[Sample]: + """Yield Samples every `interval` seconds (forever, or `count` times).""" + n = 0 + while count is None or n < count: + start = time.monotonic() + yield self.sample() + n += 1 + if count is not None and n >= count: + break + time.sleep(max(0.0, interval - (time.monotonic() - start))) diff --git a/src/rigdoctor/core/sources/__init__.py b/src/rigdoctor/core/sources/__init__.py new file mode 100644 index 0000000..5c66e72 --- /dev/null +++ b/src/rigdoctor/core/sources/__init__.py @@ -0,0 +1,31 @@ +"""Source discovery. GPU/NVIDIA first (D4), then CPU, memory, storage.""" + +from __future__ import annotations + +from .base import Source +from .cpu import CpuSource +from .memory import MemorySource +from .nvidia import NvidiaSource +from .storage import StorageSource + +# Display order: GPU first (the seed use case), then CPU, memory, storage. +ALL_SOURCE_TYPES: list[type[Source]] = [ + NvidiaSource, + CpuSource, + MemorySource, + StorageSource, +] + + +def available_sources() -> list[Source]: + """Instantiate and return the sources that probe successfully here.""" + found: list[Source] = [] + for cls in ALL_SOURCE_TYPES: + src = cls() + try: + if src.probe(): + found.append(src) + except Exception: + # A misbehaving probe must not hide the other sources. + continue + return found diff --git a/src/rigdoctor/core/sources/base.py b/src/rigdoctor/core/sources/base.py new file mode 100644 index 0000000..2ac2ce9 --- /dev/null +++ b/src/rigdoctor/core/sources/base.py @@ -0,0 +1,24 @@ +"""Source interface.""" + +from __future__ import annotations + +from ..sample import Reading + + +class Source: + """A pluggable sensor source for one subsystem. + + Subclasses set `name` and implement `probe()` and `read()`. 
Sources must never + raise on a missing sensor/tool โ€” return an empty list, or Readings with + value=None, so the rest of the system degrades gracefully (NFR). + """ + + name: str = "unknown" + + def probe(self) -> bool: + """Return True if this source can produce readings on this machine.""" + raise NotImplementedError + + def read(self) -> list[Reading]: + """Return current readings (entries may have value=None).""" + raise NotImplementedError diff --git a/src/rigdoctor/core/sources/cpu.py b/src/rigdoctor/core/sources/cpu.py new file mode 100644 index 0000000..ab6be90 --- /dev/null +++ b/src/rigdoctor/core/sources/cpu.py @@ -0,0 +1,32 @@ +"""CPU temperatures (coretemp/k10temp hwmon) + load average.""" + +from __future__ import annotations + +import os + +from ..hwmon import find_by_name, read_temps +from ..sample import Reading +from .base import Source + + +class CpuSource(Source): + name = "cpu" + + def _hwmons(self): + # Intel exposes 'coretemp'; AMD exposes 'k10temp'. + return find_by_name("coretemp") or find_by_name("k10temp") + + def probe(self) -> bool: + return bool(self._hwmons()) + + def read(self) -> list[Reading]: + readings: list[Reading] = [] + for d in self._hwmons(): + for label, celsius in read_temps(d): + readings.append(Reading("cpu", "temp", round(celsius, 1), "ยฐC", label)) + try: + load1 = os.getloadavg()[0] + readings.append(Reading("cpu", "load", round(load1, 2), "", "loadavg-1m")) + except (OSError, AttributeError): + pass + return readings diff --git a/src/rigdoctor/core/sources/memory.py b/src/rigdoctor/core/sources/memory.py new file mode 100644 index 0000000..752621c --- /dev/null +++ b/src/rigdoctor/core/sources/memory.py @@ -0,0 +1,48 @@ +"""System memory usage (/proc/meminfo) + DDR5 SPD temps (spd5118 hwmon).""" + +from __future__ import annotations + +from pathlib import Path + +from ..hwmon import find_by_name, read_temps +from ..sample import Reading +from .base import Source + +MEMINFO = Path("/proc/meminfo") +KB_PER_GB = 
1024 * 1024 + + +def _meminfo() -> dict[str, int]: + data: dict[str, int] = {} + try: + for line in MEMINFO.read_text().splitlines(): + key, _, rest = line.partition(":") + data[key.strip()] = int(rest.strip().split()[0]) # kB + except (OSError, ValueError, IndexError): + pass + return data + + +class MemorySource(Source): + name = "memory" + + def probe(self) -> bool: + return MEMINFO.exists() + + def read(self) -> list[Reading]: + readings: list[Reading] = [] + info = _meminfo() + total = info.get("MemTotal") + avail = info.get("MemAvailable") + if total is not None: + readings.append(Reading("memory", "total", round(total / KB_PER_GB, 1), "GB")) + if avail is not None: + used = total - avail + readings.append(Reading("memory", "used", round(used / KB_PER_GB, 1), "GB")) + readings.append(Reading("memory", "available", round(avail / KB_PER_GB, 1), "GB")) + readings.append(Reading("memory", "used_pct", round(100 * used / total, 1), "%")) + # DDR5 module temperatures, if exposed by the SPD hub. + for i, d in enumerate(find_by_name("spd5118")): + for _, celsius in read_temps(d): + readings.append(Reading("memory", "temp", round(celsius, 1), "ยฐC", f"DIMM{i}")) + return readings diff --git a/src/rigdoctor/core/sources/nvidia.py b/src/rigdoctor/core/sources/nvidia.py new file mode 100644 index 0000000..21bf2e4 --- /dev/null +++ b/src/rigdoctor/core/sources/nvidia.py @@ -0,0 +1,93 @@ +"""NVIDIA GPU readings via nvidia-smi (NVML wrapper).""" + +from __future__ import annotations + +import shutil +import subprocess + +from ..sample import Reading +from .base import Source + +# Fields queried from nvidia-smi, in order. 
+_QUERY = [ + "name", + "temperature.gpu", + "temperature.memory", + "utilization.gpu", + "utilization.memory", + "power.draw", + "power.limit", + "clocks.current.graphics", + "clocks.current.memory", + "fan.speed", + "memory.used", + "memory.total", + "pcie.link.gen.current", + "pcie.link.width.current", +] + + +def _f(token: str) -> float | None: + token = token.strip() + if not token or token.startswith("[") or token.lower() in ("n/a", "not supported"): + return None + try: + return float(token) + except ValueError: + return None + + +class NvidiaSource(Source): + name = "gpu" + + def probe(self) -> bool: + if shutil.which("nvidia-smi") is None: + return False + try: + subprocess.run(["nvidia-smi", "-L"], capture_output=True, timeout=5, check=True) + return True + except (subprocess.SubprocessError, OSError): + return False + + def read(self) -> list[Reading]: + try: + proc = subprocess.run( + [ + "nvidia-smi", + f"--query-gpu={','.join(_QUERY)}", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + check=True, + ) + except subprocess.TimeoutExpired: + # A query timeout is itself a signal: the GPU may be hung/lost. 
+ return [Reading("gpu", "status", None, "", "query-timeout")] + except (subprocess.SubprocessError, OSError): + return [] + + readings: list[Reading] = [] + for line in proc.stdout.strip().splitlines(): + cols = [c.strip() for c in line.split(",")] + if len(cols) != len(_QUERY): + continue + v = dict(zip(_QUERY, cols)) + readings += [ + Reading("gpu", "name", None, "", v["name"]), + Reading("gpu", "temp", _f(v["temperature.gpu"]), "ยฐC"), + Reading("gpu", "temp", _f(v["temperature.memory"]), "ยฐC", "memory"), + Reading("gpu", "util", _f(v["utilization.gpu"]), "%"), + Reading("gpu", "mem_util", _f(v["utilization.memory"]), "%"), + Reading("gpu", "power", _f(v["power.draw"]), "W"), + Reading("gpu", "power_limit", _f(v["power.limit"]), "W"), + Reading("gpu", "clock", _f(v["clocks.current.graphics"]), "MHz", "core"), + Reading("gpu", "clock", _f(v["clocks.current.memory"]), "MHz", "memory"), + Reading("gpu", "fan", _f(v["fan.speed"]), "%"), + Reading("gpu", "mem_used", _f(v["memory.used"]), "MiB"), + Reading("gpu", "mem_total", _f(v["memory.total"]), "MiB"), + Reading("gpu", "pcie_gen", _f(v["pcie.link.gen.current"]), "", "current"), + Reading("gpu", "pcie_width", _f(v["pcie.link.width.current"]), "x", "current"), + ] + return readings diff --git a/src/rigdoctor/core/sources/storage.py b/src/rigdoctor/core/sources/storage.py new file mode 100644 index 0000000..9972727 --- /dev/null +++ b/src/rigdoctor/core/sources/storage.py @@ -0,0 +1,34 @@ +"""NVMe / SSD temperatures via hwmon.""" + +from __future__ import annotations + +from pathlib import Path + +from ..hwmon import find_by_name, read_temps +from ..sample import Reading +from .base import Source + + +def _device_name(hwmon_dir: Path) -> str: + # /sys/class/hwmon/hwmonX/device -> .../nvme/nvme0 (best-effort label) + try: + return (hwmon_dir / "device").resolve().name + except OSError: + return hwmon_dir.name + + +class StorageSource(Source): + name = "storage" + + def probe(self) -> bool: + return 
bool(find_by_name("nvme")) + + def read(self) -> list[Reading]: + readings: list[Reading] = [] + for d in find_by_name("nvme"): + dev = _device_name(d) + for label, celsius in read_temps(d): + readings.append( + Reading("storage", "temp", round(celsius, 1), "ยฐC", f"{dev}:{label}") + ) + return readings diff --git a/src/rigdoctor/render.py b/src/rigdoctor/render.py new file mode 100644 index 0000000..77048fb --- /dev/null +++ b/src/rigdoctor/render.py @@ -0,0 +1,38 @@ +"""Human-readable rendering of a Sample for the terminal.""" + +from __future__ import annotations + +from .core.sample import Reading, Sample + +_GROUP_ORDER = ["gpu", "cpu", "memory", "storage"] +_GROUP_TITLES = {"gpu": "GPU", "cpu": "CPU", "memory": "Memory", "storage": "Storage"} + + +def _fmt_value(r: Reading) -> str: + if r.value is None: + return "N/A" + if r.unit == "ยฐC": + return f"{r.value:.1f} ยฐC" + if r.unit: + return f"{r.value:g} {r.unit}" + return f"{r.value:g}" + + +def _fmt(r: Reading) -> str: + if r.metric == "name": # GPU/device identity line + return f" {r.label}" + name = f"{r.metric} {r.label}".strip() + return f" {name:<22} {_fmt_value(r)}" + + +def render_snapshot(sample: Sample) -> str: + groups = sample.by_source() + ordered = [k for k in _GROUP_ORDER if k in groups] + ordered += [k for k in groups if k not in _GROUP_ORDER] + + blocks: list[str] = [] + for key in ordered: + title = _GROUP_TITLES.get(key, key.title()) + lines = [title] + [_fmt(r) for r in groups[key]] + blocks.append("\n".join(lines)) + return "\n\n".join(blocks) diff --git a/tests/test_core.py b/tests/test_core.py new file mode 100644 index 0000000..a27b6fc --- /dev/null +++ b/tests/test_core.py @@ -0,0 +1,38 @@ +"""Smoke tests for the sensor core (stdlib unittest; no hardware assumptions).""" + +import unittest + +from rigdoctor.core import sources +from rigdoctor.core.sample import Reading, Sample +from rigdoctor.core.sampler import Sampler +from rigdoctor.render import render_snapshot + + +class 
CoreTests(unittest.TestCase): + def test_available_sources_returns_list(self): + self.assertIsInstance(sources.available_sources(), list) + + def test_sample_groups_and_rows(self): + sample = Sample( + ts=1.0, + readings=[ + Reading("gpu", "temp", 50.0, "ยฐC"), + Reading("cpu", "temp", 40.0, "ยฐC", "Package id 0"), + ], + ) + self.assertEqual(set(sample.by_source()), {"gpu", "cpu"}) + rows = sample.to_rows() + self.assertEqual(rows[0]["ts"], 1.0) + self.assertEqual(rows[0]["source"], "gpu") + + def test_reading_handles_none_value(self): + text = render_snapshot(Sample(readings=[Reading("gpu", "temp", None, "ยฐC", "memory")])) + self.assertIn("N/A", text) + + def test_sampler_runs(self): + sample = Sampler(sources.available_sources()).sample() + self.assertIsInstance(sample, Sample) + + +if __name__ == "__main__": + unittest.main()