Compare commits
44 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5996fbdc30 | |||
|
8f4824f576
|
|||
|
edc2166011
|
|||
|
31ecf67ca7
|
|||
| ac4863b0d4 | |||
| b65f36bb2d | |||
|
0f9cb4b684
|
|||
|
b9bfec961c
|
|||
|
b1bc961b79
|
|||
| 410f8882ee | |||
| 1da7816741 | |||
|
33c554c29f
|
|||
| 31178bace8 | |||
|
04e8d72bce
|
|||
| fb468e83c2 | |||
|
b006fa6b8d
|
|||
| b20e8dfc3a | |||
| 9fe9a6576f | |||
|
07bc722209
|
|||
| d405bf7caf | |||
|
9bb0f9a684
|
|||
| 4bbc0fa97e | |||
|
a0f8055328
|
|||
| 323451428b | |||
|
479189ee4e
|
|||
| 51133e4042 | |||
|
bcf6ac2656
|
|||
|
81c7757546
|
|||
| d59261f021 | |||
|
44923b771a
|
|||
| eaaf14c58a | |||
| 7779131cf9 | |||
|
87fa678ccb
|
|||
| c5e24b3984 | |||
|
21cc6a4813
|
|||
| ee73049248 | |||
|
3a8ad5bd5d
|
|||
| e8b84bf046 | |||
|
2342dd83aa
|
|||
| a028fe6d38 | |||
|
a6453335e9
|
|||
| baec47dd4e | |||
| 47ecb702e7 | |||
|
dc719f6a89
|
@@ -113,13 +113,19 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
if [ -z "${PKG_TOKEN:-}" ]; then
|
if [ -z "${PKG_TOKEN:-}" ]; then
|
||||||
echo "PACKAGES_TOKEN not set — skipping apt publish (the .deb is still a release asset)."
|
echo "REGISTRY_TOKEN not set — skipping apt publish (the .deb is still a release asset)."
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
OWNER="${{ github.repository_owner }}"
|
OWNER="${{ github.repository_owner }}"
|
||||||
URL="${{ github.server_url }}/api/packages/${OWNER}/debian/pool/stable/main/upload"
|
URL="${{ github.server_url }}/api/packages/${OWNER}/debian/pool/stable/main/upload"
|
||||||
for f in dist/*.deb; do
|
for f in dist/*.deb; do
|
||||||
echo "Uploading $(basename "$f") to the apt registry…"
|
echo "Uploading $(basename "$f") to the apt registry…"
|
||||||
curl -sS --fail --user "${OWNER}:${PKG_TOKEN}" --upload-file "$f" "$URL"
|
code=$(curl -sS -o /tmp/apt_upload.txt -w '%{http_code}' \
|
||||||
|
--user "${OWNER}:${PKG_TOKEN}" --upload-file "$f" "$URL" || true)
|
||||||
|
case "$code" in
|
||||||
|
2*) echo " uploaded ($code)";;
|
||||||
|
409) echo " already published ($code) — skipping (registry versions are immutable)";;
|
||||||
|
*) echo " upload failed ($code):"; cat /tmp/apt_upload.txt || true; exit 1;;
|
||||||
|
esac
|
||||||
done
|
done
|
||||||
echo "apt source: deb ${{ github.server_url }}/api/packages/${OWNER}/debian stable main"
|
echo "apt source: deb ${{ github.server_url }}/api/packages/${OWNER}/debian stable main"
|
||||||
|
|||||||
+131
@@ -5,6 +5,137 @@ All notable changes to RigDoctor are recorded here. Format follows
|
|||||||
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
|
(`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
|
||||||
release tag (so the auto-updater, D18, can compare versions).
|
release tag (so the auto-updater, D18, can compare versions).
|
||||||
|
|
||||||
|
## [0.43.0] - 2026-05-29
|
||||||
|
### Added
|
||||||
|
- **GPU stress test + close thermal monitoring** (`rigdoctor stress`, and a "Stress test…" button
|
||||||
|
on System Health). Runs a GPU load and samples sensors at a high rate (default 0.5 s), then
|
||||||
|
reports per-metric min/avg/**peak**, how long the core spent above each temperature threshold,
|
||||||
|
power vs the limit, throttling (decoded from the NVML clocks-event bitmask), and any GPU **fault**
|
||||||
|
(Xid / VA-space freeze / a query-timeout hang) that hit during the window — the on-demand way to
|
||||||
|
reproduce load-correlated crashes. The load comes from an explicit `--command` (a game or a tool
|
||||||
|
like gpu-burn), an auto-detected loader (gpu-burn/vkmark/glmark2/vkcube), or **monitor-only** when
|
||||||
|
none is found (you launch the game; it tracks temps while you play).
|
||||||
|
- **Drive health & wear in the health report.** A new `core/drives.py` parses the full
|
||||||
|
`smartctl --json` for every drive into prioritized findings: the SMART verdict, a derived
|
||||||
|
**life-left %** (NVMe `percentage_used` or the SATA wear-leveling attribute), **power-on hours**,
|
||||||
|
data written (TBW), temperature, and the early-failure predictors (reallocated / pending /
|
||||||
|
offline-uncorrectable sectors, NVMe media errors, low available spare). Replaces the old
|
||||||
|
pass/fail-only SMART check; flows through the same elevated path (GUI launch / `sudo rigdoctor
|
||||||
|
report`), degrading to per-drive "needs root" notes unprivileged.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- **GUI "Add game…" can now link a launcher.** The dialog only asked for a name, so a custom
|
||||||
|
game (e.g. SPT) couldn't be given its launch command or log folder from the app — those were
|
||||||
|
CLI-only, leaving it unlaunchable from the GUI. It's now a proper form: name + an optional
|
||||||
|
launch command/script (with a **Browse…** file picker) + an optional log folder (auto-detected
|
||||||
|
from the script's folder when left blank).
|
||||||
|
|
||||||
|
## [0.42.0] - 2026-05-29
|
||||||
|
### Added
|
||||||
|
- **Detect hard freezes that log no Xid.** The kernel-log scanner caught Xid codes, OOM, panic,
|
||||||
|
MCE, PCIe AER, thermal events, and amdgpu resets — but a crash that logs *no* Xid slipped
|
||||||
|
through. It now flags the NVIDIA open-kernel-module **VA-space mapping fault** (`gpu_vaspace.c`
|
||||||
|
/ `dmaAllocMapping` assertions, NVKMS GEM-allocation failures) — a driver-internal error that
|
||||||
|
can storm for minutes and end in a freeze without the GPU ever "falling off the bus" (distinct
|
||||||
|
from Xid 79). A new `check_nvidia_module()` notes when the open module (`nvidia-*-open`) is
|
||||||
|
loaded — the context behind these faults — and a new `ai_knowledge` entry lets the assistant
|
||||||
|
tell the no-Xid freeze apart from the Xid 79 hardware drop.
|
||||||
|
- **Add games no launcher reports (e.g. SPT).** A user-authored custom-games list
|
||||||
|
(`core/customgames.py`) shows alongside Steam/Lutris/Heroic in `rigdoctor games` and the GUI
|
||||||
|
("Add game…"), for standalone mod launchers (Single-Player Tarkov), itch.io downloads, or any
|
||||||
|
hand-installed game. Each entry can carry a launch command and a log directory:
|
||||||
|
`rigdoctor games add "SPT" --command .../tarkov.sh` (a sibling `logs/` is auto-detected),
|
||||||
|
`rigdoctor games play "SPT"` launches it under the crash-capture wrapper (tagged with the real
|
||||||
|
name, not the script's), and the diagnostic now tails the game's *own* logs — SPT's
|
||||||
|
server/launcher logs — alongside the kernel log so the analysis sees what the game logged
|
||||||
|
before the freeze.
|
||||||
|
|
||||||
|
## [0.41.0] - 2026-05-25
|
||||||
|
### Added
|
||||||
|
- **Import a crash dump (`.dmp`) and explain it with AI.** The **Games** page gains an
|
||||||
|
"Import crash dump…" button (shown once an AI provider is configured) that opens a Windows
|
||||||
|
minidump — the kind a Proton/Wine game writes when it hard-crashes — parses it, and hands the
|
||||||
|
result to the opt-in AI assistant (D24; cloud sends still ask first). A new stdlib
|
||||||
|
`core/minidump.py` reads the `MDMP` streams with `struct` (no new deps): the exception / crash
|
||||||
|
reason (e.g. access violation `0xC0000005`), the **faulting module** (which DLL the crash
|
||||||
|
address lands in — `nvwgf2umx.dll`, `d3d11.dll`, an anticheat, the game's own `.exe`…), OS/CPU,
|
||||||
|
and the loaded-module list. If `minidump_stackwalk` (Breakpad) or `minidump-stackwalk`
|
||||||
|
(rust-minidump) is on PATH, its fuller report is appended best-effort. The model is told the
|
||||||
|
dump came from a Windows process under Proton, so fixes stay Linux/Proton-side (Proton version,
|
||||||
|
DXVK/VKD3D, driver, launch options) — never Windows admin/registry steps. New `ai_knowledge`
|
||||||
|
facts cover the common exception codes and faulting-module signatures. CLI parity:
|
||||||
|
`rigdoctor ai dump <file>`.
|
||||||
|
|
||||||
|
## [0.40.0] - 2026-05-22
|
||||||
|
### Added
|
||||||
|
- **RAM speed / XMP-EXPO check.** Inventory now shows each module's configured speed and, when it's
|
||||||
|
below the rated speed, the rating (e.g. `4800 MT/s (rated 5600)`); **System Health** flags it
|
||||||
|
("RAM at 4800 MT/s (rated 5600 MT/s)") with the fix — enable XMP/EXPO in BIOS. With the profile
|
||||||
|
off, dmidecode only reports the JEDEC base, so the rated speed is read from both dmidecode and
|
||||||
|
the part number (matched against known DDR5 speed grades, so no false positives). Needs dmidecode
|
||||||
|
(root / launch elevation). Completes the "underperforming hardware" trio with PCIe gen + refresh.
|
||||||
|
|
||||||
|
## [0.39.0] - 2026-05-22
|
||||||
|
### Added
|
||||||
|
- **Displays in the Inventory.** A new `core/displays.py` lists each connected monitor with its
|
||||||
|
resolution and current/max refresh — e.g. `DP-1 · Samsung LC34G55T → 3440x1440 @ 165 Hz`. Reads
|
||||||
|
GNOME's Mutter `DisplayConfig` over D-Bus (works on X11 *and* Wayland), falling back to `xrandr`
|
||||||
|
on other X11 desktops.
|
||||||
|
- **System Health flags monitors below their max refresh.** If a monitor supports a higher refresh
|
||||||
|
at its current resolution (e.g. a 165 Hz panel set to 60 Hz — an easily-missed gaming setting),
|
||||||
|
Health reports it with the fix (raise it in Display settings). Max is computed at the *current*
|
||||||
|
resolution, so it never suggests dropping resolution.
|
||||||
|
|
||||||
|
## [0.38.0] - 2026-05-22
|
||||||
|
### Added
|
||||||
|
- **PCIe link in the Inventory.** Each NVMe drive now shows its negotiated PCIe link next to the
|
||||||
|
model — e.g. `Samsung SSD 980 PRO 1TB (931.5G) · PCIe Gen4 x4` — read from sysfs
|
||||||
|
(`current/max_link_speed` + width). If a drive negotiates below its capability (a slower M.2
|
||||||
|
slot, lane-sharing, or a downtrain) it's flagged: `PCIe Gen3 x4 (capable of Gen4 x4)`. So you
|
||||||
|
can confirm a Gen4 SSD is actually in a Gen4 slot. (SATA disks show no PCIe link.)
|
||||||
|
- **System Health flags downtrained NVMe links.** A new check warns when an NVMe drive negotiates
|
||||||
|
fewer PCIe lanes than it supports (almost always motherboard **lane-sharing** — a GPU/second
|
||||||
|
card or another M.2 stealing lanes) and notes speed-only reductions as info (a slower slot or
|
||||||
|
idle ASPM). The GPU is deliberately excluded — NVIDIA drops its PCIe gen/width at idle, so a
|
||||||
|
snapshot would false-alarm.
|
||||||
|
|
||||||
|
## [0.37.1] - 2026-05-22
|
||||||
|
### Fixed
|
||||||
|
- **`rigdoctor update` now uses the right method for how RigDoctor was installed.** It detects
|
||||||
|
apt (`.deb`), pip (venv/`.run`), or source installs (`updates.install_kind()`); only pip
|
||||||
|
installs self-update in place. An apt install no longer fails with "No module named pip" —
|
||||||
|
it (and the GUI Update button) shows `sudo apt update && sudo apt install --only-upgrade
|
||||||
|
rigdoctor`; a source checkout points to `git pull`.
|
||||||
|
|
||||||
|
## [0.37.0] - 2026-05-22
|
||||||
|
### Added
|
||||||
|
- **Version footer** — a footer across the bottom of the window shows `RigDoctor v<version>` in
|
||||||
|
the bottom-right (moved out of the sidebar).
|
||||||
|
### Fixed
|
||||||
|
- **Pages scroll when content doesn't fit, and the window is no longer pinned to the tallest
|
||||||
|
page's height.** Long pages (Settings, Tuning, …) get a scrollbar when too tall — so controls
|
||||||
|
like Uninstall are always reachable — and the window can now be resized smaller than the screen
|
||||||
|
(min height dropped from "taller than the screen" to ~600px). Pages that manage their own
|
||||||
|
scroll/fill (Dashboard, System Health, Inventory, Share) are unchanged.
|
||||||
|
|
||||||
|
## [0.36.1] - 2026-05-22
|
||||||
|
### Fixed
|
||||||
|
- `rigdoctor gui` printed the wrong fix when PySide6 is missing — it suggested the non-existent
|
||||||
|
`python3-pyside6` package. Now it names the real split modules
|
||||||
|
(`python3-pyside6.qt{widgets,gui,websockets,svg}` + `python3-pyte`).
|
||||||
|
|
||||||
|
## [0.36.0] - 2026-05-22
|
||||||
|
### Fixed
|
||||||
|
- **`.deb` now installs all dependencies automatically — no manual tool install.** The previous
|
||||||
|
`Recommends: python3-pyside6` named a package that doesn't exist on Debian/Ubuntu (PySide6 is
|
||||||
|
split per module), so apt silently skipped it and the GUI wouldn't start. Now it Recommends the
|
||||||
|
actual modules the GUI imports — `python3-pyside6.qt{widgets,gui,websockets,svg}` + `python3-pyte`.
|
||||||
|
### Changed
|
||||||
|
- **`apt install rigdoctor` sets up the whole toolset.** The `.deb` also Recommends the optional
|
||||||
|
diagnostic/gaming tools (smartmontools, lm-sensors, dmidecode, pciutils, libnotify-bin,
|
||||||
|
libsecret-tools, gamemode, mangohud) so they install by default — users never hand-install
|
||||||
|
tools. `cpupower` is a Suggests (kernel-tied); `--no-install-recommends` still gives CLI-only.
|
||||||
|
|
||||||
## [0.35.0] - 2026-05-22
|
## [0.35.0] - 2026-05-22
|
||||||
### Added
|
### Added
|
||||||
- **`.deb` package (M9 / D8)** — `packaging/make_deb.py` builds a `rigdoctor_<version>_all.deb`
|
- **`.deb` package (M9 / D8)** — `packaging/make_deb.py` builds a `rigdoctor_<version>_all.deb`
|
||||||
|
|||||||
@@ -1,152 +1,137 @@
|
|||||||
# RigDoctor
|
# RigDoctor
|
||||||
|
|
||||||
A **modular diagnostics, monitoring, and health-check toolkit for Linux gamers.**
|
**Hardware monitoring & crash diagnostics for Linux gamers.** Live sensors, crash-safe
|
||||||
|
logging, plain-language health reports, per-game diagnostics, and optional AI explanations —
|
||||||
|
in a desktop app, a tray applet, or the terminal. Ubuntu/Debian + NVIDIA first.
|
||||||
|
|
||||||
> **Status:** 🟢 Phase 1 (MVP) complete. The **sensor core (M1)**, **crash-capture logger
|
Linux gaming faults are hard to pin down — GPUs falling off the PCIe bus, black screens
|
||||||
> (M3)**, and **health report (M4)** all work — live `snapshot`/`monitor`, crash-safe `record`
|
mid-game, silent thermal/VRAM throttling, driver/Proton mismatches. The useful data is
|
||||||
> with a post-crash report, and `report` to scan logs/SMART/driver for likely causes. A
|
scattered across `nvidia-smi`, `/sys`, `journalctl`, and SMART, and the readings right before a
|
||||||
> desktop GUI (M10) ties them together (dashboard, recording, health). See `docs/ROADMAP.md`.
|
freeze are usually lost. RigDoctor pulls it together and keeps the evidence.
|
||||||
|
|
||||||
## Why this exists
|
## Features
|
||||||
|
|
||||||
Linux gaming hardware faults are hard to diagnose: GPUs falling off the PCIe bus, the screen
|
- **Live monitoring** — a dark desktop **dashboard** (history graphs + per-subsystem cards), a
|
||||||
suddenly going black mid-game, silent thermal/VRAM throttling, power transients,
|
**tray applet** with at-a-glance status, and a terminal view (`rigdoctor monitor`).
|
||||||
driver/library mismatches, Proton quirks, and CPU governor / power-profile misconfiguration.
|
- **Crash-safe recording** — background logger that `fsync`s every sample, so the state right
|
||||||
The data needed to diagnose them is scattered across `nvidia-smi`, `/sys/class/hwmon`,
|
before a hard freeze survives. Manual, always-on, or auto-start when a game launches.
|
||||||
`journalctl`, SMART, and more — and the most useful readings (the ones right before a hard
|
- **Health report** — scans `journalctl`/SMART/driver for likely causes (Xid, OOM, disk
|
||||||
freeze) are usually lost because nothing flushed them to disk.
|
errors, throttling…) and explains them with suggested fixes.
|
||||||
|
- **Per-game diagnostics** — pick a game, capture while you play, get a focused report; hard
|
||||||
|
crashes are detected and analysed on next launch.
|
||||||
|
- **Gaming tune-ups** — flags risky settings (CPU governor, PCIe ASPM, persistence mode…) with
|
||||||
|
**one-click, reversible fixes**.
|
||||||
|
- **Proactive alerts** — desktop notifications on overheating and critical kernel events
|
||||||
|
(GPU-lost, Xid, out-of-memory, disk I/O).
|
||||||
|
- **AI explanations** *(optional, opt-in)* — explain a diagnostic in plain language with a
|
||||||
|
**local model (Ollama)** or **Claude**, or **import a Windows crash dump (`.dmp`)** from a
|
||||||
|
Proton game and have it parsed and analysed. Never automatic; only when you press the button.
|
||||||
|
- **Shareable reports** — zip a diagnostic (logs, inventory, AI transcript) to hand to someone,
|
||||||
|
or share a live **terminal session** for remote help.
|
||||||
|
- **Self-updating** — `apt upgrade`, or the in-app updater.
|
||||||
|
|
||||||
RigDoctor pulls all of that into one modular tool: live monitoring, crash-safe logging, a
|
## Screenshots
|
||||||
one-shot health report, and an interactive installer that only sets up the modules a given
|
|
||||||
user actually needs for their hardware.
|
|
||||||
|
|
||||||
**Seed use cases:** an RTX 3070 that intermittently "falls off the bus" under heavy GPU load
|
| Dashboard | Inventory |
|
||||||
(Path of Exile on Linux, Escape from Tarkov on Windows), and a monitor going black mid-game.
|
|---|---|
|
||||||
See `docs/SPEC.md` §1.
|
|  |  |
|
||||||
|
|
||||||
## How you run it
|
**Share** — a read-only or interactive terminal session over the relay, for remote help:
|
||||||
|
|
||||||
RigDoctor is **GUI-first** — the desktop app is the primary way in — but every feature is
|

|
||||||
also available headless:
|
|
||||||
- **Desktop GUI** — graphical dashboard, recording controls, log browser, reports. The
|
|
||||||
default interface for most users.
|
|
||||||
- **Tray applet** — a small top-menu-bar applet with quick actions and at-a-glance status.
|
|
||||||
- **CLI** — full functionality from the terminal; works over SSH and in scripts.
|
|
||||||
|
|
||||||
The GUI/tray are optional modules; a headless (CLI-only) install loses no capability.
|
## Install
|
||||||
|
|
||||||
## Key decisions (settled)
|
### Debian / Ubuntu — `.deb`
|
||||||
|
|
||||||
| Topic | Decision |
|
The simplest path: grab the latest **`rigdoctor_<version>_all.deb`** from the
|
||||||
|-------|----------|
|
[releases page](https://git.jesseyvanofferen.com/jessey/rigdoctor/releases) and install it —
|
||||||
| Name | **RigDoctor** |
|
apt pulls the GUI dependencies (PySide6, pyte) automatically:
|
||||||
| Language / stack | **Python 3 + Qt (PySide6)** — core/CLI/daemon stdlib-only; Qt only for GUI/tray |
|
|
||||||
| Primary distro | **Ubuntu** (Debian via apt); others best-effort later |
|
|
||||||
| Primary GPU | **NVIDIA** first; AMD, then Intel later |
|
|
||||||
| MVP | **Sensor core + crash logger + health report** (NVIDIA-only, CLI-first) |
|
|
||||||
| Distribution | **User-local install** (self-updating from the public repo, no root); **`.deb`** optional |
|
|
||||||
| Scope of action | **Read-only + suggestions** (no auto-apply yet) |
|
|
||||||
| Stress tests | **Out of scope** |
|
|
||||||
|
|
||||||
Full rationale and the still-open questions are in `docs/DECISIONS.md`.
|
|
||||||
|
|
||||||
## Repo layout
|
|
||||||
|
|
||||||
| Path | Purpose |
|
|
||||||
|------|---------|
|
|
||||||
| `docs/SPEC.md` | Product specification — vision, requirements, modules (the main planning doc) |
|
|
||||||
| `docs/ARCHITECTURE.md` | Technical design — core engine, front-ends, daemon, installer |
|
|
||||||
| `docs/MODULES.md` | Catalog of modules with scope, dependencies, status |
|
|
||||||
| `docs/ROADMAP.md` | Phased milestones |
|
|
||||||
| `docs/DECISIONS.md` | Decision log + remaining open questions |
|
|
||||||
| `src/rigdoctor/` | Source code — `core/` engine + sources, `cli.py`, `render.py` |
|
|
||||||
| `installer/` | Installer / `.deb` packaging (empty until Phase 4) |
|
|
||||||
| `tests/` | Tests (stdlib `unittest`) |
|
|
||||||
|
|
||||||
## Install (user-local, no root)
|
|
||||||
|
|
||||||
RigDoctor installs into a private venv under `~/.local` — no root, self-updating:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./install.sh # from a source checkout or the self-extracting .run
|
sudo apt install ./rigdoctor_*_all.deb # CLI only: add --no-install-recommends
|
||||||
./install.sh --ref v0.0.6 # install a specific released tag (needs a token)
|
|
||||||
./install.sh --uninstall # remove it
|
|
||||||
```
|
```
|
||||||
|
|
||||||
This adds `rigdoctor` / `rigdoctor-gui` to `~/.local/bin` and a desktop entry. Each release
|
**Or add the apt repository** for `apt install` + automatic updates (the registry is public and
|
||||||
also ships a one-file **`.run`** installer (download, `chmod +x`, run). Updates are gated to
|
GPG-signed — no token needed):
|
||||||
accounts on the Git server (a Personal Access Token); save one via the GUI **Setup → Update
|
|
||||||
access** panel or `rigdoctor login`, then `rigdoctor update` (or the sidebar button).
|
|
||||||
|
|
||||||
## Install (`.deb`, system-wide)
|
|
||||||
|
|
||||||
Each release also ships a **`.deb`** (`Architecture: all`, M9/D8). Download it from the release
|
|
||||||
and install with apt (pulls the GUI deps — PySide6/pyte — via Recommends):
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo apt install ./rigdoctor_<version>_all.deb # CLI-only: add --no-install-recommends
|
sudo curl https://git.jesseyvanofferen.com/api/packages/jessey/debian/repository.key -o /etc/apt/keyrings/gitea-jessey.asc
|
||||||
|
echo "deb [arch=all signed-by=/etc/apt/keyrings/gitea-jessey.asc] https://git.jesseyvanofferen.com/api/packages/jessey/debian stable main" | sudo tee /etc/apt/sources.list.d/gitea.list
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install rigdoctor
|
||||||
```
|
```
|
||||||
|
|
||||||
When the apt registry is enabled on the server, you can instead add it as a source and
|
Then `sudo apt upgrade` keeps it current.
|
||||||
`sudo apt update && sudo apt install rigdoctor` (with `apt upgrade` for updates):
|
|
||||||
|
Then `sudo apt upgrade` keeps it current.
|
||||||
|
|
||||||
|
### Any distro — self-extracting `.run` (no root)
|
||||||
|
|
||||||
|
Download **`rigdoctor-<version>-installer.run`** from the releases page and run it. It installs
|
||||||
|
into a private virtualenv under `~/.local` (no root), adds the launchers + desktop entry, and
|
||||||
|
opens the first-run setup wizard:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -fsSL https://git.jesseyvanofferen.com/api/packages/jessey/debian/repository.key \
|
sh rigdoctor-*-installer.run
|
||||||
| sudo tee /etc/apt/keyrings/gitea-rigdoctor.asc > /dev/null
|
|
||||||
echo "deb [signed-by=/etc/apt/keyrings/gitea-rigdoctor.asc] \
|
|
||||||
https://git.jesseyvanofferen.com/api/packages/jessey/debian stable main" \
|
|
||||||
| sudo tee /etc/apt/sources.list.d/rigdoctor.list
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Run it (dev)
|
### Updating & removing
|
||||||
|
|
||||||
Stdlib-only, no install needed (target is Python ≥ 3.11; tested on 3.14):
|
- **`.deb`:** `sudo apt upgrade` (or reinstall a newer `.deb`).
|
||||||
|
- **`.run` / user-local:** the in-app **Update** button, or `rigdoctor update`.
|
||||||
|
- **Remove:** `sudo apt remove rigdoctor`, or `rigdoctor uninstall` for the user-local install.
|
||||||
|
|
||||||
|
## Using it
|
||||||
|
|
||||||
|
Launch **RigDoctor** from your app menu, or:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
PYTHONPATH=src python3 -m rigdoctor snapshot # one-shot sensor read
|
rigdoctor-gui # desktop app (+ tray)
|
||||||
PYTHONPATH=src python3 -m rigdoctor snapshot --json
|
rigdoctor --help # everything from the terminal (works over SSH)
|
||||||
PYTHONPATH=src python3 -m rigdoctor monitor -n 1 # live view (Ctrl-C to quit)
|
|
||||||
PYTHONPATH=src python3 -m rigdoctor sources # list detected sensor sources
|
|
||||||
PYTHONPATH=src python3 -m unittest discover -s tests
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Crash-capture logger (M3)
|
Handy CLI commands:
|
||||||
|
|
||||||
A crash-safe background logger (JSONL, `fsync` per sample, bounded by rotation) for catching
|
|
||||||
the state right before a freeze:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
rigdoctor record start # start logging in the background
|
rigdoctor snapshot # one-shot reading of every sensor
|
||||||
rigdoctor record status # is it running? latest readings, sample count
|
rigdoctor monitor # live terminal dashboard
|
||||||
rigdoctor record stop # stop it
|
rigdoctor report # health report (logs / SMART / driver)
|
||||||
rigdoctor record report # post-crash summary: peaks, events, last samples
|
rigdoctor diagnose start|finish # capture while gaming, then analyse
|
||||||
rigdoctor record run # run in the foreground (the systemd-ready entrypoint)
|
rigdoctor gameenv # flag risky gaming settings + fixes
|
||||||
|
rigdoctor inventory # hardware/OS inventory
|
||||||
|
rigdoctor ai explain # AI explanation of the current findings (opt-in)
|
||||||
|
rigdoctor bundle # zip the latest diagnostic into a shareable report
|
||||||
```
|
```
|
||||||
|
|
||||||
Logs live in `~/.local/share/rigdoctor/logs/`. It detects GPU "lost"/hang (nvidia-smi query
|
## Requirements
|
||||||
timeout) and writes an event marker. Trigger modes (always-on / game-launch) and the
|
|
||||||
`systemd --user` service arrive in Phase 4.
|
|
||||||
|
|
||||||
### Desktop GUI (M10)
|
- **Linux** — Ubuntu/Debian first-class (the `.deb`); the `.run` works on any distro with
|
||||||
|
Python ≥ 3.11.
|
||||||
|
- **GPU** — NVIDIA fully supported (via `nvidia-smi`); AMD/Intel sensors are best-effort.
|
||||||
|
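- **CLI/daemon** need only Python 3 (stdlib). The **GUI/tray** add **PySide6** (on Debian/Ubuntu the split `python3-pyside6.qt{widgets,gui,websockets,svg}` modules — there is no single `python3-pyside6` package) and `python3-pyte`.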
|
||||||
|
- Optional tools unlock more: `smartmontools`, `lm-sensors`, `gamemode`, `mangohud`. The setup
|
||||||
|
wizard offers to install them.
|
||||||
|
|
||||||
The GUI uses PySide6 (Qt) — the only part of RigDoctor that needs a non-stdlib dep:
|
## Privacy
|
||||||
|
|
||||||
|
Everything stays on your machine — no telemetry, no phone-home. The AI assistant is **off by
|
||||||
|
default** and runs only when you explicitly trigger it; with Ollama nothing leaves the machine,
|
||||||
|
and the Claude option asks before sending. Reports are local files; they leave only if you share
|
||||||
|
the zip.
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
RigDoctor's core is stdlib-only Python; the GUI/tray use PySide6.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -e '.[gui]' # core + PySide6, gives `rigdoctor` and `rigdoctor-gui`
|
git clone https://git.jesseyvanofferen.com/jessey/rigdoctor && cd rigdoctor
|
||||||
rigdoctor gui # or: rigdoctor-gui
|
pip install -e ".[gui]" # core + GUI; omit [gui] for CLI-only
|
||||||
|
python -m unittest discover -s tests # run the test suite
|
||||||
|
PYTHONPATH=src python3 -m rigdoctor snapshot # run without installing
|
||||||
```
|
```
|
||||||
|
|
||||||
It opens a dark-themed window with sidebar navigation and a **live dashboard** over the
|
Design docs live in `docs/` — `SPEC.md` (vision/requirements), `ARCHITECTURE.md`,
|
||||||
same sensor core — circular gauges for the headline metrics plus collapsible per-subsystem
|
`MODULES.md` (module catalog), `ROADMAP.md`, and `DECISIONS.md` (the decision log).
|
||||||
cards (GPU/CPU/memory/storage) with temperature-colored values (icy-blue → green → red).
|
Contributions: branch off `main`, keep tests green (CI runs them on PRs), and bump the version
|
||||||
The **Logs** and **Health** sections are full pages (recording controls + post-crash report;
|
+ `CHANGELOG.md` for shipped changes.
|
||||||
and the kernel-log / SMART / driver scan). **Inventory** is a placeholder until M5 lands.
|
|
||||||
|
|
||||||
Without the GUI extra, `pip install -e .` gives just the stdlib-only CLI.
|
|
||||||
|
|
||||||
## Start here
|
|
||||||
|
|
||||||
1. Read `docs/SPEC.md` for what we're building.
|
|
||||||
2. Read `docs/ROADMAP.md` for the build order (Phase 1 = the MVP).
|
|
||||||
3. Read `docs/DECISIONS.md` for the settled decisions (D1–D15).
|
|
||||||
</content>
|
|
||||||
|
|||||||
Binary file not shown.
|
After Width: | Height: | Size: 42 KiB |
@@ -0,0 +1,17 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="512" height="512" viewBox="0 0 512 512">
|
||||||
|
<defs>
|
||||||
|
<radialGradient id="bg" cx="50%" cy="42%" r="78%">
|
||||||
|
<stop offset="0%" stop-color="#1b2230"/>
|
||||||
|
<stop offset="100%" stop-color="#0d0f13"/>
|
||||||
|
</radialGradient>
|
||||||
|
</defs>
|
||||||
|
<rect width="512" height="512" fill="url(#bg)"/>
|
||||||
|
<!-- gauge ring -->
|
||||||
|
<circle cx="256" cy="256" r="168" fill="none" stroke="#2a2f39" stroke-width="28"/>
|
||||||
|
<!-- accent sweep -->
|
||||||
|
<path d="M256 88 a168 168 0 1 1 -118.8 49.2" fill="none" stroke="#38bdf8"
|
||||||
|
stroke-width="28" stroke-linecap="round"/>
|
||||||
|
<!-- heartbeat / monitoring trace -->
|
||||||
|
<path d="M120 264 H200 L232 192 L280 336 L312 264 H392" fill="none" stroke="#e6e8eb"
|
||||||
|
stroke-width="28" stroke-linecap="round" stroke-linejoin="round"/>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 798 B |
Binary file not shown.
|
After Width: | Height: | Size: 171 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 141 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 78 KiB |
@@ -2,9 +2,13 @@
|
|||||||
|
|
||||||
Pure-Python app, so it's `Architecture: all`: we stage the package into dist-packages, drop the
|
Pure-Python app, so it's `Architecture: all`: we stage the package into dist-packages, drop the
|
||||||
two launchers in /usr/bin, install the desktop entry + icon, write a DEBIAN/control, and call
|
two launchers in /usr/bin, install the desktop entry + icon, write a DEBIAN/control, and call
|
||||||
`dpkg-deb`. The core is stdlib (`Depends: python3`); the GUI/tray deps are **Recommends**
|
`dpkg-deb`. The core is stdlib (`Depends: python3`); everything else is **Recommends** so a
|
||||||
(`python3-pyside6`, `python3-pyte`) so `apt install rigdoctor` gives the full app by default,
|
plain `apt install rigdoctor` sets up the whole toolset automatically (users never hand-install
|
||||||
while `--no-install-recommends` yields a CLI-only install.
|
deps) — the GUI modules (Debian/Ubuntu split PySide6 per module, so we name
|
||||||
|
`python3-pyside6.qt{widgets,gui,websockets,svg}`) + `python3-pyte`, plus the diagnostic/gaming
|
||||||
|
tools (smartmontools, lm-sensors, dmidecode, pciutils, libnotify-bin, libsecret-tools, gamemode,
|
||||||
|
mangohud). `--no-install-recommends` still yields a CLI-only install; `cpupower` is a Suggests
|
||||||
|
(kernel-tied/heavy).
|
||||||
|
|
||||||
Run: `python packaging/make_deb.py` → `dist/rigdoctor_<version>_all.deb`.
|
Run: `python packaging/make_deb.py` → `dist/rigdoctor_<version>_all.deb`.
|
||||||
"""
|
"""
|
||||||
@@ -57,7 +61,8 @@ Maintainer: {maintainer}
|
|||||||
Section: utils
|
Section: utils
|
||||||
Priority: optional
|
Priority: optional
|
||||||
Depends: python3 (>= 3.11)
|
Depends: python3 (>= 3.11)
|
||||||
Recommends: python3-pyside6, python3-pyte
|
Recommends: python3-pyside6.qtwidgets, python3-pyside6.qtgui, python3-pyside6.qtwebsockets, python3-pyside6.qtsvg, python3-pyte, smartmontools, lm-sensors, dmidecode, pciutils, libnotify-bin, libsecret-tools, gamemode, mangohud
|
||||||
|
Suggests: linux-tools-generic
|
||||||
Homepage: {homepage}
|
Homepage: {homepage}
|
||||||
Description: Hardware monitoring & crash diagnostics for Linux gamers
|
Description: Hardware monitoring & crash diagnostics for Linux gamers
|
||||||
RigDoctor monitors GPU/CPU temperatures, load, and sensors, captures crash
|
RigDoctor monitors GPU/CPU temperatures, load, and sensors, captures crash
|
||||||
|
|||||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "rigdoctor"
|
name = "rigdoctor"
|
||||||
version = "0.35.0"
|
version = "0.43.0"
|
||||||
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
|
description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
|
"""RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""
|
||||||
|
|
||||||
__version__ = "0.35.0"
|
__version__ = "0.43.0"
|
||||||
|
|||||||
+135
-6
@@ -55,8 +55,9 @@ def cmd_gui(args) -> int:
|
|||||||
from .gui.app import main as gui_main
|
from .gui.app import main as gui_main
|
||||||
except ImportError as exc:
|
except ImportError as exc:
|
||||||
print("The GUI needs PySide6, which isn't installed.")
|
print("The GUI needs PySide6, which isn't installed.")
|
||||||
print(" Install it with: pip install 'rigdoctor[gui]'")
|
print(" Ubuntu/Debian: sudo apt install python3-pyside6.qtwidgets "
|
||||||
print(" or on Ubuntu: sudo apt install python3-pyside6")
|
"python3-pyside6.qtgui python3-pyside6.qtwebsockets python3-pyside6.qtsvg python3-pyte")
|
||||||
|
print(" pip: pip install 'rigdoctor[gui]'")
|
||||||
print(f" ({exc})")
|
print(f" ({exc})")
|
||||||
return 2
|
return 2
|
||||||
return gui_main([sys.argv[0]])
|
return gui_main([sys.argv[0]])
|
||||||
@@ -262,6 +263,10 @@ def cmd_update(args) -> int:
|
|||||||
print("\nWhat's new:\n" + "\n".join(" " + ln for ln in notes.splitlines()) + "\n")
|
print("\nWhat's new:\n" + "\n".join(" " + ln for ln in notes.splitlines()) + "\n")
|
||||||
if args.check:
|
if args.check:
|
||||||
return 0
|
return 0
|
||||||
|
kind = updates.install_kind()
|
||||||
|
if kind != "pip": # apt/source installs aren't pip-updatable — show the right command
|
||||||
|
print(updates.update_hint(kind))
|
||||||
|
return 0
|
||||||
print(f"Installing {tag}…")
|
print(f"Installing {tag}…")
|
||||||
rc, out = updates.apply_update(tag)
|
rc, out = updates.apply_update(tag)
|
||||||
print(out[-2000:])
|
print(out[-2000:])
|
||||||
@@ -293,10 +298,10 @@ def cmd_collect_priv(args) -> int:
|
|||||||
"""Internal: emit root-only data (SMART + dmidecode) as JSON, run via pkexec at launch."""
|
"""Internal: emit root-only data (SMART + dmidecode) as JSON, run via pkexec at launch."""
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
|
|
||||||
from .core.health import check_smart
|
from .core import drives
|
||||||
from .core.inventory import _dmidecode
|
from .core.inventory import _dmidecode
|
||||||
|
|
||||||
data = {"smart": [asdict(f) for f in check_smart()], "dmidecode": _dmidecode()}
|
data = {"drives": [asdict(d) for d in drives.collect()], "dmidecode": _dmidecode()}
|
||||||
print(json.dumps(data))
|
print(json.dumps(data))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
@@ -461,6 +466,20 @@ def cmd_ai(args) -> int:
|
|||||||
print(msg)
|
print(msg)
|
||||||
return 0 if ok else 1
|
return 0 if ok else 1
|
||||||
|
|
||||||
|
if sub == "dump":
|
||||||
|
# Parse a Windows .dmp minidump (e.g. from a Proton game crash) and explain it.
|
||||||
|
from .core import minidump
|
||||||
|
|
||||||
|
report = minidump.parse(args.file)
|
||||||
|
if not report.ok:
|
||||||
|
print(f"Couldn't analyze the dump — {report.error}")
|
||||||
|
return 1
|
||||||
|
print(minidump.to_text(report))
|
||||||
|
print(f"\nAsking {ai.provider_label()} to explain {os.path.basename(args.file)}…\n")
|
||||||
|
ok, msg = ai.explain(minidump.to_ai_text(report))
|
||||||
|
print(msg)
|
||||||
|
return 0 if ok else 1
|
||||||
|
|
||||||
# explain: gather the current health findings and ask the provider to explain them.
|
# explain: gather the current health findings and ask the provider to explain them.
|
||||||
from .core import health
|
from .core import health
|
||||||
|
|
||||||
@@ -506,13 +525,13 @@ def cmd_gameenv(args) -> int:
|
|||||||
def cmd_games(args) -> int:
|
def cmd_games(args) -> int:
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
|
|
||||||
from .core import launchers, steam
|
from .core import customgames, launchers, steam
|
||||||
|
|
||||||
selected = steam.selected_library_paths()
|
selected = steam.selected_library_paths()
|
||||||
result = steam.rescan() if selected else None
|
result = steam.rescan() if selected else None
|
||||||
steam_games = result.games if result else []
|
steam_games = result.games if result else []
|
||||||
extra = launchers.scan() # non-Steam (Lutris/Heroic)
|
extra = launchers.scan() # non-Steam (Lutris/Heroic)
|
||||||
all_games = list(steam_games) + list(extra)
|
all_games = list(steam_games) + list(extra) + customgames.scan() # + user-added (SPT etc.)
|
||||||
|
|
||||||
if args.json:
|
if args.json:
|
||||||
print(json.dumps({
|
print(json.dumps({
|
||||||
@@ -577,6 +596,91 @@ def cmd_games_libraries(args) -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_games_add(args) -> int:
    """Register a user-added game and echo what was stored.

    Returns 0 when `customgames.add` accepted the entry, 1 when it refused it.
    """
    from .core import customgames

    accepted = customgames.add(args.name, command=args.command, logdir=args.logdir)
    if not accepted:
        print(f"'{args.name}' is blank or already in your custom games.")
        return 1

    print(f"Added '{args.name}' to your games (custom). It'll show in `rigdoctor games` "
          "and the diagnostic game picker.")
    # Read the entry back so the summary reflects the stored command/logdir values.
    stored = customgames.get(args.name) or {}
    if stored.get("command"):
        print(f" launch: {stored['command']} (run with: rigdoctor games play \"{args.name}\")")
    if stored.get("logdir"):
        print(f" logs: {stored['logdir']} (included in crash diagnostics)")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_games_play(args) -> int:
    """Launch a custom game through `wrap.run`, or explain why it cannot be launched.

    Returns the value of `wrap.run` when the game has a launch command, otherwise 1.
    """
    from .core import customgames, wrap

    argv = customgames.command(args.name)
    if argv is not None:
        print(f"Launching '{args.name}' with crash-capture… (capture stops cleanly on exit; "
              "a hard freeze is flagged next time you open RigDoctor)")
        return wrap.run(argv, game=args.name)

    # No command: distinguish "unknown game" from "known game without a command".
    known = customgames.get(args.name) is not None
    if not known:
        print(f"'{args.name}' isn't in your custom games. Add it: "
              f"rigdoctor games add \"{args.name}\" --command <launch script>")
    else:
        print(f"'{args.name}' has no launch command. Set one: "
              f"rigdoctor games remove \"{args.name}\" && rigdoctor games add \"{args.name}\" "
              "--command <launch script>")
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_games_remove(args) -> int:
    """Remove a user-added game; returns 0 if one was removed, 1 otherwise."""
    from .core import customgames

    if not customgames.remove(args.name):
        # Nothing matched: list what is stored so the user can spot the right name.
        print(f"'{args.name}' isn't in your custom games. Current: {', '.join(customgames.names()) or '(none)'}")
        return 1
    print(f"Removed '{args.name}' from your custom games.")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_stress(args) -> int:
    """Run a stress / close-monitoring session via `stress.run` and print the result.

    `args` supplies `command` (optional load-generator command line), `duration`,
    `interval` and `json`. In text mode a one-line live readout is redrawn on every tick
    and the rendered report follows; in JSON mode only the serialized result is printed.

    Returns 0 when the result severity is "ok" or "info", 1 for any other severity, and
    2 when `--command` cannot be parsed (e.g. unbalanced quotes).
    """
    import shlex as _shlex
    import sys as _sys

    from .core import stress
    from .render import format_raw, render_stress

    try:
        command = _shlex.split(args.command) if args.command else None
    except ValueError as exc:
        # Malformed quoting is a usage error, not a crash. stderr keeps stdout clean for
        # --json consumers.
        print(f"Invalid --command: {exc}", file=_sys.stderr)
        return 2

    if not args.json:
        loaders = stress.available_loaders()
        if command:
            print(f"Stressing with: {' '.join(command)}")
        elif loaders:
            print(f"Stressing with auto-detected loader: {loaders[0]}")
        else:
            print("No GPU load tool found and no --command given — MONITOR-ONLY mode.")
            print(f" Launch the game/app now; I'll closely track temps for up to {int(args.duration)}s.")
        print(f" Sampling every {args.interval:g}s. Press Ctrl-C to stop early.\n")

    def _tick(sample, elapsed) -> None:
        """Redraw the live status line from the readings of one sample."""
        by = {r.key: r for r in sample.readings}
        bits = [f"{elapsed:5.0f}s"]
        for key, tag in (("gpu.temp", "core"), ("gpu.power", "pwr"),
                         ("gpu.util", "util"), ("gpu.clock.core", "clk")):
            r = by.get(key)
            if r is not None and r.value is not None:
                bits.append(f"{tag} {format_raw(r.value, r.unit)}")
        # "\r" keeps the readout on a single line that is overwritten each tick.
        print(" " + " ".join(bits) + " ", end="\r", flush=True)

    result = stress.run(duration=args.duration, interval=args.interval, command=command,
                        on_tick=None if args.json else _tick)
    if not args.json:
        print()  # end the live line

    if args.json:
        from dataclasses import asdict
        print(json.dumps(asdict(result), indent=2, ensure_ascii=False))
    else:
        print(render_stress(result))
    return 0 if result.severity in ("ok", "info") else 1
|
||||||
|
|
||||||
|
|
||||||
def build_parser() -> argparse.ArgumentParser:
|
def build_parser() -> argparse.ArgumentParser:
|
||||||
p = argparse.ArgumentParser(
|
p = argparse.ArgumentParser(
|
||||||
prog="rigdoctor",
|
prog="rigdoctor",
|
||||||
@@ -594,6 +698,14 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
mp.add_argument("--plain", action="store_true", help="plain redraw instead of the curses UI")
|
mp.add_argument("--plain", action="store_true", help="plain redraw instead of the curses UI")
|
||||||
mp.set_defaults(func=cmd_monitor)
|
mp.set_defaults(func=cmd_monitor)
|
||||||
|
|
||||||
|
st = sub.add_parser("stress", help="GPU stress + close thermal monitoring (repro load crashes)")
|
||||||
|
st.add_argument("-d", "--duration", type=float, default=120.0, help="run for this many seconds (default 120)")
|
||||||
|
st.add_argument("-n", "--interval", type=float, default=0.5, help="sampling interval in seconds (default 0.5)")
|
||||||
|
st.add_argument("--command", default=None,
|
||||||
|
help="load generator to run (e.g. a game or 'gpu-burn 60'); omit to auto-detect or monitor-only")
|
||||||
|
st.add_argument("--json", action="store_true", help="output JSON")
|
||||||
|
st.set_defaults(func=cmd_stress)
|
||||||
|
|
||||||
sub.add_parser("gui", help="launch the desktop GUI (needs PySide6)").set_defaults(func=cmd_gui)
|
sub.add_parser("gui", help="launch the desktop GUI (needs PySide6)").set_defaults(func=cmd_gui)
|
||||||
sub.add_parser("sources", help="list detected sensor sources").set_defaults(func=cmd_sources)
|
sub.add_parser("sources", help="list detected sensor sources").set_defaults(func=cmd_sources)
|
||||||
|
|
||||||
@@ -662,6 +774,20 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
lib_p.add_argument("--json", action="store_true", help="output JSON")
|
lib_p.add_argument("--json", action="store_true", help="output JSON")
|
||||||
lib_p.set_defaults(func=cmd_games_libraries)
|
lib_p.set_defaults(func=cmd_games_libraries)
|
||||||
|
|
||||||
|
add_p = games_sub.add_parser("add", help="add a game no launcher reports (e.g. SPT)")
|
||||||
|
add_p.add_argument("name", help="game name, e.g. \"SPT\"")
|
||||||
|
add_p.add_argument("--command", default=None,
|
||||||
|
help="launch command/script (e.g. the path to tarkov.sh) — enables `games play`")
|
||||||
|
add_p.add_argument("--logdir", default=None,
|
||||||
|
help="the game's own log directory (auto-detected as <command dir>/logs if present)")
|
||||||
|
add_p.set_defaults(func=cmd_games_add)
|
||||||
|
play_p = games_sub.add_parser("play", help="launch a custom game with crash-capture (e.g. SPT)")
|
||||||
|
play_p.add_argument("name", help="game name to launch")
|
||||||
|
play_p.set_defaults(func=cmd_games_play)
|
||||||
|
rm_p = games_sub.add_parser("remove", help="remove a previously added custom game")
|
||||||
|
rm_p.add_argument("name", help="game name to remove")
|
||||||
|
rm_p.set_defaults(func=cmd_games_remove)
|
||||||
|
|
||||||
env_p = sub.add_parser("gameenv", help="gaming environment checks (M6): flag stability/perf settings")
|
env_p = sub.add_parser("gameenv", help="gaming environment checks (M6): flag stability/perf settings")
|
||||||
env_p.add_argument("--json", action="store_true", help="output JSON instead of text")
|
env_p.add_argument("--json", action="store_true", help="output JSON instead of text")
|
||||||
env_p.set_defaults(func=cmd_gameenv)
|
env_p.set_defaults(func=cmd_gameenv)
|
||||||
@@ -702,6 +828,9 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
ai_sub.add_parser("status", help="show the configured provider (contacts nothing)").set_defaults(func=cmd_ai)
|
ai_sub.add_parser("status", help="show the configured provider (contacts nothing)").set_defaults(func=cmd_ai)
|
||||||
ai_sub.add_parser("test", help="send a tiny probe to verify connectivity").set_defaults(func=cmd_ai)
|
ai_sub.add_parser("test", help="send a tiny probe to verify connectivity").set_defaults(func=cmd_ai)
|
||||||
ai_sub.add_parser("explain", help="explain the current health findings with AI").set_defaults(func=cmd_ai)
|
ai_sub.add_parser("explain", help="explain the current health findings with AI").set_defaults(func=cmd_ai)
|
||||||
|
dump_p = ai_sub.add_parser("dump", help="parse a Windows .dmp crash dump and explain it with AI")
|
||||||
|
dump_p.add_argument("file", help="path to the .dmp minidump (e.g. from a Proton game crash)")
|
||||||
|
dump_p.set_defaults(func=cmd_ai)
|
||||||
ai_p.set_defaults(func=cmd_ai, ai_cmd=None)
|
ai_p.set_defaults(func=cmd_ai, ai_cmd=None)
|
||||||
|
|
||||||
bundle_p = sub.add_parser("bundle", help="zip the latest stored diagnostic into a report bundle (M15)")
|
bundle_p = sub.add_parser("bundle", help="zip the latest stored diagnostic into a report bundle (M15)")
|
||||||
|
|||||||
@@ -36,6 +36,9 @@ SPAWN_LOG = STATE_DIR / "recorder.out"
|
|||||||
# Gaming environment / game detection (M6) — cached Steam game scan (mutable state,
|
# Gaming environment / game detection (M6) — cached Steam game scan (mutable state,
|
||||||
# not config: refreshed by the background scan on every launch).
|
# not config: refreshed by the background scan on every launch).
|
||||||
GAMES_FILE = STATE_DIR / "games.json"
|
GAMES_FILE = STATE_DIR / "games.json"
|
||||||
|
# User-added games that no launcher reports (e.g. SPT/standalone mod launchers). Authored
|
||||||
|
# by the user (not a refreshable cache), so it lives in DATA_DIR and persists across scans.
|
||||||
|
CUSTOM_GAMES_FILE = DATA_DIR / "custom-games.json"
|
||||||
|
|
||||||
# Logging & reports (opt-in via `logging_enabled`). App log: rotating file of app events.
|
# Logging & reports (opt-in via `logging_enabled`). App log: rotating file of app events.
|
||||||
# Each diagnostic is stored under DIAGNOSTICS_DIR/<id>/; "Report" zips one into REPORTS_DIR.
|
# Each diagnostic is stored under DIAGNOSTICS_DIR/<id>/; "Report" zips one into REPORTS_DIR.
|
||||||
|
|||||||
@@ -30,6 +30,14 @@ ENTRIES: list[tuple[tuple[str, ...], str]] = [
|
|||||||
(("xid 8", "xid 62", "xid 63", "xid 64"),
|
(("xid 8", "xid 62", "xid 63", "xid 64"),
|
||||||
"These Xid codes commonly indicate VRAM/ECC or memory-training problems — suspect failing "
|
"These Xid codes commonly indicate VRAM/ECC or memory-training problems — suspect failing "
|
||||||
"VRAM or an unstable memory overclock."),
|
"VRAM or an unstable memory overclock."),
|
||||||
|
(("va-space mapping", "gpu_vaspace", "dmaallocmapping", "nvkms memory for gem",
|
||||||
|
"open kernel module", "nvidia open"),
|
||||||
|
"NVIDIA open-kernel-module VA-space mapping errors (gpu_vaspace.c / dmaAllocMapping / "
|
||||||
|
"'Failed to allocate NVKMS memory for GEM object') are a driver-internal fault on the open "
|
||||||
|
"module (nvidia-*-open). They can storm for minutes and end in a HARD FREEZE with NO Xid "
|
||||||
|
"logged — so the GPU never 'falls off the bus', and this is distinct from the Xid 79 "
|
||||||
|
"hardware drop. Fix path: switch from the open to the proprietary NVIDIA kernel module and "
|
||||||
|
"update to the latest driver branch."),
|
||||||
(("smart 197", "current_pending_sector", "pending sector"),
|
(("smart 197", "current_pending_sector", "pending sector"),
|
||||||
"SMART 197 (Current Pending Sector) > 0 = sectors the drive can't read and is waiting to "
|
"SMART 197 (Current Pending Sector) > 0 = sectors the drive can't read and is waiting to "
|
||||||
"reallocate — early sign of a failing disk. Back up now and run an extended self-test."),
|
"reallocate — early sign of a failing disk. Back up now and run an extended self-test."),
|
||||||
@@ -76,6 +84,35 @@ ENTRIES: list[tuple[tuple[str, ...], str]] = [
|
|||||||
(("fork without exec", "skipping destruction"),
|
(("fork without exec", "skipping destruction"),
|
||||||
"BENIGN: 'pid X != Y, skipping destruction (fork without exec?)' is routine Steam/Proton "
|
"BENIGN: 'pid X != Y, skipping destruction (fork without exec?)' is routine Steam/Proton "
|
||||||
"process bookkeeping, not an error."),
|
"process bookkeeping, not an error."),
|
||||||
|
# --- crash-dump (.dmp) reasoning -------------------------------------------------
|
||||||
|
(("access violation", "0xc0000005", "0xc0000006"),
|
||||||
|
"Windows exception 0xC0000005 (access violation) = the game read/wrote/executed memory it "
|
||||||
|
"wasn't allowed to. A write/read to a low address (near 0x0) is a null-pointer dereference, "
|
||||||
|
"usually a game or graphics-driver bug; under Proton it's often a DXVK/VKD3D or Proton-version "
|
||||||
|
"issue. Identify the faulting MODULE to localize the fault."),
|
||||||
|
(("stack overflow", "0xc00000fd"),
|
||||||
|
"Windows exception 0xC00000FD (stack overflow) = unbounded recursion or a huge stack "
|
||||||
|
"allocation in the crashing module — almost always a software bug in that module."),
|
||||||
|
(("0xc0000409", "stack buffer overrun", "fast fail"),
|
||||||
|
"Windows 0xC0000409 (stack buffer overrun / __fastfail) = a security check tripped on memory "
|
||||||
|
"corruption; frequently anticheat or a DRM/overlay injecting into the game. Suspect overlays "
|
||||||
|
"(Steam/Discord/MSI Afterburner-equivalents) and anticheat compatibility under Proton."),
|
||||||
|
(("0xc0000374", "heap corruption"),
|
||||||
|
"Windows 0xC0000374 (heap corruption) = something scribbled over heap memory earlier; the "
|
||||||
|
"crash point is a symptom, not the cause. Often a mod, an injected overlay, or unstable RAM."),
|
||||||
|
(("nvwgf2umx", "nvoglv", "nvd3dum", "nvldumd"),
|
||||||
|
"A faulting NVIDIA user-mode driver DLL (nvwgf2umx/nvoglv/nvd3dum) means the crash happened "
|
||||||
|
"inside the GPU driver under Proton. On Linux this points at the NVIDIA driver + the "
|
||||||
|
"DXVK/VKD3D translation layer: try a different driver branch or Proton/Proton-GE version, "
|
||||||
|
"clear the DXVK shader cache, and revert any GPU overclock/undervolt."),
|
||||||
|
(("easyanticheat", "eac", "battleye", "beclient", "anticheat"),
|
||||||
|
"A faulting anticheat module (EasyAntiCheat/BattlEye) under Proton is usually a compatibility "
|
||||||
|
"problem: confirm the title's anticheat has Proton/Linux support enabled and try the Proton "
|
||||||
|
"version the community recommends for it (often Proton-GE or a specific Valve build)."),
|
||||||
|
(("d3d11.dll", "d3d12.dll", "dxgi.dll", "d3d9.dll", "dxvk", "vkd3d"),
|
||||||
|
"A crash in a Direct3D/DXGI module under Proton runs through DXVK (D3D9/10/11) or VKD3D-Proton "
|
||||||
|
"(D3D12). Try a known-good Proton version, update/override DXVK-VKD3D, clear the shader cache, "
|
||||||
|
"and check the GPU driver — these are the usual fixes for D3D faults on Linux."),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,113 @@
|
|||||||
|
"""User-added games (M6): a manual list for titles no launcher reports.
|
||||||
|
|
||||||
|
Some games never show up in a Steam/Lutris/Heroic scan — standalone mod launchers like
|
||||||
|
**SPT** (Single-Player Tarkov), itch.io downloads, or any hand-installed executable. This
|
||||||
|
module keeps a small user-authored list so those still appear in the game list and can be
|
||||||
|
picked for a focused diagnostic, in the same `steam.Game` shape as every other source.
|
||||||
|
|
||||||
|
Each entry is a name plus two optionals: a **launch command** (so `rigdoctor games play`
|
||||||
|
can start it under the auto-capture wrapper) and a **log directory** (so a crash diagnostic
|
||||||
|
can read the game's own logs — e.g. SPT's `logs/tarkov-latest.log`). Stored as JSON in
|
||||||
|
`config.CUSTOM_GAMES_FILE`; stdlib only; every reader degrades to [] on a missing/bad file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shlex
|
||||||
|
|
||||||
|
from .. import config
|
||||||
|
from .steam import Game
|
||||||
|
|
||||||
|
LAUNCHER = "custom"
|
||||||
|
|
||||||
|
|
||||||
|
def _load() -> list[dict]:
    """Read the stored entries; a missing, unreadable or malformed file yields []."""
    try:
        raw = config.CUSTOM_GAMES_FILE.read_text()
        data = json.loads(raw)
    except (OSError, ValueError):
        return []
    if not isinstance(data, dict):
        return []
    games = data.get("games")
    if not isinstance(games, list):
        return []
    # Keep only dict entries that carry a non-empty name.
    return [entry for entry in games if isinstance(entry, dict) and entry.get("name")]
|
||||||
|
|
||||||
|
|
||||||
|
def _save(games: list[dict]) -> None:
    """Persist `games` as ``{"games": [...]}`` JSON in `config.CUSTOM_GAMES_FILE`.

    The payload is written to a sibling temporary file and then moved over the target
    with `os.replace`, so an interrupted write cannot leave a truncated list behind.
    Raises OSError if the directory or file cannot be written.
    """
    target = config.CUSTOM_GAMES_FILE
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({"games": games}, indent=2, ensure_ascii=False) + "\n"
    scratch = target.with_name(target.name + ".tmp")
    scratch.write_text(payload)
    os.replace(scratch, target)
|
||||||
|
|
||||||
|
|
||||||
|
def names() -> list[str]:
    """Return the stored game names as strings, in file order."""
    stored: list[str] = []
    for entry in _load():
        stored.append(str(entry["name"]))
    return stored
|
||||||
|
|
||||||
|
|
||||||
|
def get(name: str) -> dict | None:
    """Look up a stored entry by name (trimmed, case-insensitive); None if absent."""
    wanted = (name or "").strip().lower()
    for entry in _load():
        if str(entry["name"]).lower() == wanted:
            return entry
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def add(name: str, command: str | None = None, logdir: str | None = None) -> bool:
    """Add a game by name, with an optional launch command and log directory.

    Returns False if the name is blank or already present (case-insensitive), True once
    the entry has been saved.

    When a command is given but no logdir, a `logs/` directory next to the command's
    executable is used if it exists (covers SPT's layout). The inference is skipped when
    the executable has no directory component (a bare name resolved via PATH) or when the
    command cannot be tokenized; `~` in the executable path is expanded before the check.
    """
    name = (name or "").strip()
    if not name or get(name):
        return False

    entry: dict = {"name": name}
    command = (command or "").strip()
    if command:
        entry["command"] = command
        if not logdir:
            try:
                exe_dir = os.path.dirname(os.path.expanduser(_argv0(command)))
            except ValueError:
                # Unbalanced quotes: the executable's location cannot be determined.
                exe_dir = ""
            if exe_dir:
                sibling = os.path.join(exe_dir, "logs")
                if os.path.isdir(sibling):
                    logdir = sibling

    logdir = (logdir or "").strip()
    if logdir:
        entry["logdir"] = os.path.expanduser(logdir)

    games = _load()
    games.append(entry)
    _save(games)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def remove(name: str) -> bool:
    """Delete every entry whose name matches (trimmed, case-insensitive).

    Returns True when at least one entry was dropped and the file was rewritten.
    """
    wanted = (name or "").strip().lower()
    before = _load()
    after = []
    for entry in before:
        if str(entry["name"]).lower() != wanted:
            after.append(entry)
    if len(after) == len(before):
        return False
    _save(after)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def _argv0(command: str) -> str:
|
||||||
|
parts = shlex.split(command)
|
||||||
|
return parts[0] if parts else command
|
||||||
|
|
||||||
|
|
||||||
|
def command(name: str) -> list[str] | None:
    """Return the stored launch command split into argv, or None when none is stored."""
    stored = get(name)
    raw = stored.get("command") if stored else None
    if not raw:
        return None
    return shlex.split(raw)
|
||||||
|
|
||||||
|
|
||||||
|
def log_dir(name: str) -> str | None:
    """Return the stored log directory if it is set and exists on disk, else None."""
    stored = get(name)
    path = stored.get("logdir") if stored else None
    if not path:
        return None
    return path if os.path.isdir(path) else None
|
||||||
|
|
||||||
|
|
||||||
|
def scan() -> list[Game]:
    """Return the user-added games as `Game` objects (launcher='custom'), sorted by name."""
    found = []
    for entry in _load():
        found.append(Game(appid="", name=str(entry["name"]), library="", installdir="",
                          launcher=LAUNCHER))
    # Case-insensitive ordering by display name.
    found.sort(key=lambda game: game.name.lower())
    return found
|
||||||
@@ -75,7 +75,7 @@ def store(result, capture_path=None, since: float | None = None) -> Path | None:
|
|||||||
_write(target / "report.txt", "\n".join(report))
|
_write(target / "report.txt", "\n".join(report))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logs = gamelogs.collect(since=since)
|
logs = gamelogs.collect(since=since, game=getattr(result, "game", None))
|
||||||
if logs:
|
if logs:
|
||||||
_write(target / "gamelogs.txt", logs)
|
_write(target / "gamelogs.txt", logs)
|
||||||
except OSError:
|
except OSError:
|
||||||
|
|||||||
@@ -0,0 +1,148 @@
|
|||||||
|
"""Connected displays (M5): resolution + current/max refresh per monitor.
|
||||||
|
|
||||||
|
GNOME exposes the authoritative data over D-Bus (Mutter `DisplayConfig.GetCurrentState`),
|
||||||
|
which works on both X11 and Wayland — read via `busctl --json`. Plain X11 desktops fall back
|
||||||
|
to `xrandr`. Other Wayland compositors (sway/KDE) aren't covered yet and degrade to empty.
|
||||||
|
Stdlib only; every probe fails soft. Max refresh is computed at the *current* resolution, so
|
||||||
|
"can go faster" never suggests dropping resolution.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
# A few common PNP monitor-vendor IDs → friendly names (best-effort; unknown codes pass through).
|
||||||
|
_PNP = {
|
||||||
|
"SAM": "Samsung", "DEL": "Dell", "GSM": "LG", "LGD": "LG", "AUS": "ASUS", "ACR": "Acer",
|
||||||
|
"BNQ": "BenQ", "MSI": "MSI", "AOC": "AOC", "VSC": "ViewSonic", "HWP": "HP", "HPN": "HP",
|
||||||
|
"PHL": "Philips", "GBT": "Gigabyte", "APP": "Apple", "DGC": "Dell",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Monitor:
    """One connected display: its current mode and the best refresh at that resolution."""

    # Output connector, e.g. "DP-1".
    connector: str
    # Human-readable model, e.g. "Samsung LC34G55T"; "" if unknown (e.g. xrandr).
    name: str
    width: int
    height: int
    # Current refresh rate in Hz.
    refresh: float
    # Highest refresh rate in Hz available at the current resolution.
    max_refresh: float

    @property
    def can_go_faster(self) -> bool:
        """Whether the max refresh exceeds the current one by more than 1 Hz."""
        headroom = self.max_refresh - self.refresh
        return headroom > 1.0

    def label(self) -> str:
        """Display label: "<connector> · <name>", or just the connector without a name."""
        if not self.name:
            return self.connector
        return f"{self.connector} · {self.name}".rstrip(" ·")
|
||||||
|
|
||||||
|
|
||||||
|
def _run(cmd: list[str], timeout: float = 8.0) -> str:
|
||||||
|
try:
|
||||||
|
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
|
||||||
|
if proc.returncode == 0:
|
||||||
|
return proc.stdout
|
||||||
|
except (subprocess.SubprocessError, OSError):
|
||||||
|
pass
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_mutter(out: str) -> list[Monitor]:
    """Parse `busctl --json` output of Mutter DisplayConfig.GetCurrentState.

    data = [serial, monitors, logical_monitors, props]; each monitor is
    [[connector, vendor, product, serial], [modes], props]; each mode is
    [id, width, height, refresh, scale, [scales], {props}] where props may hold is-current.

    Returns one `Monitor` per entry that has a current mode. Unparseable input yields [];
    a single malformed monitor entry is skipped without discarding the others.
    """
    try:
        raw_monitors = json.loads(out)["data"][1]
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        return []
    if not isinstance(raw_monitors, list):
        return []

    monitors: list[Monitor] = []
    for mon in raw_monitors:
        try:
            connector, vendor, product = mon[0][0], mon[0][1], mon[0][2]
            modes = mon[1]
            current = None
            for m in modes:
                props = m[6] if len(m) > 6 and isinstance(m[6], dict) else {}
                # busctl wraps variants as {"type": ..., "data": ...}; "data" holds the flag.
                if (props.get("is-current") or {}).get("data"):
                    current = m
                    break
            if current is None:
                continue
            w, h, r = int(current[1]), int(current[2]), float(current[3])
            # Max refresh is taken only over modes at the current resolution.
            max_r = max((float(m[3]) for m in modes if int(m[1]) == w and int(m[2]) == h),
                        default=r)
            name = f"{_PNP.get(vendor, vendor)} {product}".strip()
        except (IndexError, KeyError, TypeError, ValueError, AttributeError):
            # Unexpected shape for this monitor: skip it, keep the rest.
            continue
        monitors.append(Monitor(connector, name, w, h, r, max_r))
    return monitors
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_xrandr(out: str) -> list[Monitor]:
    """Parse `xrandr --query` output into monitors.

    An output's header line carries the active geometry `WxH+x+y`; the indented mode lines
    that follow list refresh rates, with `*` marking the current one. Only rates at the
    active resolution feed `max_refresh`.

    Every non-indented line starts a new section, so mode lines of an output without an
    active geometry (connected but disabled, or disconnected) are never credited to the
    previously parsed monitor. For outputs rotated `left`/`right` the header geometry is
    transposed relative to the mode list, so mode dimensions are swapped before comparing.
    Outputs with no current rate are omitted.
    """
    monitors: list[Monitor] = []
    out_re = re.compile(r"^(\S+) connected.*?(\d+)x(\d+)\+\d+\+\d+(?:\s+(left|right)\b)?")
    mode_re = re.compile(r"^\s+(\d+)x(\d+)\s+(.+)$")
    name = ""
    cw = ch = 0
    cur_r = max_r = 0.0
    transposed = False

    def flush() -> None:
        """Emit the output collected so far, if it has a geometry and a current rate."""
        if name and cw and cur_r:
            monitors.append(Monitor(name, "", cw, ch, cur_r, max_r or cur_r))

    for line in out.splitlines():
        if line and not line[0].isspace():
            # Any header line ("Screen 0: …", "<output> connected …") closes the section.
            flush()
            name, cw, ch, transposed = "", 0, 0, False
            cur_r = max_r = 0.0
            mo = out_re.match(line)
            if mo:
                name, cw, ch = mo.group(1), int(mo.group(2)), int(mo.group(3))
                transposed = mo.group(4) is not None
            continue
        mm = mode_re.match(line)
        if not (mm and name):
            continue
        mw, mh = int(mm.group(1)), int(mm.group(2))
        if transposed:
            mw, mh = mh, mw
        if (mw, mh) != (cw, ch):
            continue
        for tok in mm.group(3).split():
            try:
                rate = float(tok.rstrip("*+"))
            except ValueError:
                continue  # a bare "+" marker or other non-numeric token
            max_r = max(max_r, rate)
            if "*" in tok:
                cur_r = rate
    flush()
    return monitors
|
||||||
|
|
||||||
|
|
||||||
|
def _mutter() -> list[Monitor]:
    """Query Mutter's DisplayConfig.GetCurrentState via `busctl`; [] if unavailable."""
    busctl = shutil.which("busctl")
    if not busctl:
        return []
    argv = [
        busctl, "--user", "--json=short", "call",
        "org.gnome.Mutter.DisplayConfig",
        "/org/gnome/Mutter/DisplayConfig",
        "org.gnome.Mutter.DisplayConfig",
        "GetCurrentState",
    ]
    reply = _run(argv)
    if not reply.strip():
        return []
    return _parse_mutter(reply)
|
||||||
|
|
||||||
|
|
||||||
|
def _xrandr() -> list[Monitor]:
    """Parse `xrandr --query` when the tool is on PATH; [] otherwise."""
    if shutil.which("xrandr"):
        listing = _run(["xrandr", "--query"])
        return _parse_xrandr(listing)
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def collect() -> list[Monitor]:
    """Return monitors from the first backend that reports any (Mutter, then xrandr)."""
    for probe in (_mutter, _xrandr):
        try:
            found = probe()
        except Exception:
            # Best-effort probing: a failing backend just falls through to the next one.
            continue
        if found:
            return found
    return []
|
||||||
@@ -0,0 +1,229 @@
|
|||||||
|
"""Drive health & wear (M-drives): per-disk SMART stats parsed from smartctl JSON.
|
||||||
|
|
||||||
|
Unlike a GPU, storage exposes a real health/wear story, so this reads it in full: the overall
|
||||||
|
SMART verdict, a derived **life-left %** (NVMe ``percentage_used`` or the SATA wear-leveling
|
||||||
|
attribute), **power-on hours** (the drive's runtime), data written (TBW), temperature, and the
|
||||||
|
early-failure predictors (reallocated / pending / offline-uncorrectable sectors, NVMe media
|
||||||
|
errors, available spare). Turned into prioritized health findings.
|
||||||
|
|
||||||
|
smartctl needs root, so collection runs through the same elevated path as the other root-only
|
||||||
|
checks (``rigdoctor collect-priv`` via pkexec at GUI launch, or ``sudo rigdoctor report``).
|
||||||
|
Parsing is JSON-based (smartctl ``--json``), which is stable across drive types. Stdlib only;
|
||||||
|
degrades gracefully — no smartctl, no root, or an unparseable device yields an info finding.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from .health import CRITICAL, INFO, OK, WARNING, Finding
|
||||||
|
|
||||||
|
# NVMe counts writes in "data units" of 1000 × 512-byte blocks, i.e. 512,000 bytes each (decimal, not 512 KiB).
|
||||||
|
_NVME_UNIT_BYTES = 512_000
|
||||||
|
_LBA_BYTES = 512 # SATA Total_LBAs_Written counts 512-byte sectors
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DriveHealth:
    """SMART-derived health snapshot for one drive.

    Only ``device`` is required. Every measurement defaults to ``None`` ("not reported"),
    so a record can be created before (or without) a successful smartctl read.
    """

    # --- identity -------------------------------------------------------------
    device: str                                   # device path, e.g. as listed by `smartctl --scan`
    model: str = ""
    kind: str = ""                                # "nvme" | "sata" | "scsi"

    # --- overall verdict ------------------------------------------------------
    passed: bool | None = None                    # SMART overall verdict; None if unknown / needs root
    needs_root: bool = False                      # set when the drive could not be read

    # --- wear / runtime -------------------------------------------------------
    health_pct: int | None = None                 # derived life-left %
    percent_used: int | None = None               # NVMe wear used %
    power_on_hours: int | None = None
    temp_c: int | None = None
    data_written_tb: float | None = None

    # --- early-failure predictors ----------------------------------------------
    reallocated: int | None = None                # SATA reallocated sectors (id 5)
    pending: int | None = None                    # SATA current-pending sectors (id 197)
    offline_uncorrectable: int | None = None      # SATA id 198
    available_spare: int | None = None            # NVMe %
    available_spare_threshold: int | None = None
    media_errors: int | None = None               # NVMe
|
||||||
|
|
||||||
|
|
||||||
|
# --- collection (root) ----------------------------------------------------------------
|
||||||
|
|
||||||
|
def _scan_devices() -> list[str]:
|
||||||
|
try:
|
||||||
|
proc = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10)
|
||||||
|
except (subprocess.SubprocessError, OSError):
|
||||||
|
return []
|
||||||
|
return [ln.split()[0] for ln in proc.stdout.splitlines() if ln.strip().startswith("/dev/")]
|
||||||
|
|
||||||
|
|
||||||
|
def _smartctl_json(device: str) -> dict | None:
|
||||||
|
try:
|
||||||
|
proc = subprocess.run(
|
||||||
|
["smartctl", "--json=c", "-H", "-A", "-i", device],
|
||||||
|
capture_output=True, text=True, timeout=20,
|
||||||
|
)
|
||||||
|
except (subprocess.SubprocessError, OSError):
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return json.loads(proc.stdout)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _ata_attr(data: dict, attr_id: int) -> int | None:
|
||||||
|
for row in data.get("ata_smart_attributes", {}).get("table", []):
|
||||||
|
if row.get("id") == attr_id:
|
||||||
|
raw = row.get("raw", {})
|
||||||
|
return raw.get("value")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _ata_norm_value(data: dict, attr_id: int) -> int | None:
|
||||||
|
"""The normalized 'value' (100→0 life indicator) for an ATA attribute."""
|
||||||
|
for row in data.get("ata_smart_attributes", {}).get("table", []):
|
||||||
|
if row.get("id") == attr_id:
|
||||||
|
return row.get("value")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse(device: str, data: dict | None) -> DriveHealth:
    """Translate one smartctl JSON report into a DriveHealth record (no IO of its own).

    An empty or missing report yields a record flagged ``needs_root``. Otherwise the model,
    protocol-derived kind, SMART verdict, temperature and power-on hours are read from the
    common sections, followed by either the NVMe health log or the ATA attribute table.
    """
    drive = DriveHealth(device=device)
    if not data:
        drive.needs_root = True
        return drive

    drive.model = data.get("model_name") or data.get("scsi_model_name") or ""

    protocol = (data.get("device", {}).get("protocol") or "").lower()
    if "nvme" in protocol:
        drive.kind = "nvme"
    elif "ata" in protocol:
        drive.kind = "sata"
    else:
        drive.kind = protocol  # any other protocol is kept verbatim (lower-cased); may be ""

    verdict = data.get("smart_status")
    if isinstance(verdict, dict) and "passed" in verdict:
        drive.passed = bool(verdict["passed"])
    elif data.get("smartctl", {}).get("exit_status", 0) and not verdict:
        # No verdict and a non-zero exit usually means we couldn't open the device (needs root).
        drive.needs_root = True

    current_temp = data.get("temperature", {}).get("current")
    if isinstance(current_temp, (int, float)):
        drive.temp_c = int(current_temp)

    hours = data.get("power_on_time", {}).get("hours")
    if isinstance(hours, (int, float)):
        drive.power_on_hours = int(hours)

    if drive.kind == "nvme":
        health_log = data.get("nvme_smart_health_information_log", {})
        drive.percent_used = health_log.get("percentage_used")
        drive.available_spare = health_log.get("available_spare")
        drive.available_spare_threshold = health_log.get("available_spare_threshold")
        drive.media_errors = health_log.get("media_errors")

        # Fall back to the health log's own temperature when the common section had none.
        log_temp = health_log.get("temperature")
        if drive.temp_c is None and isinstance(log_temp, (int, float)):
            drive.temp_c = int(log_temp)

        written_units = health_log.get("data_units_written")
        if isinstance(written_units, (int, float)):
            drive.data_written_tb = round(written_units * _NVME_UNIT_BYTES / 1e12, 2)

        if isinstance(drive.percent_used, (int, float)):
            # Clamp at 0: life left never goes negative even if percentage_used exceeds 100.
            drive.health_pct = max(0, 100 - int(drive.percent_used))
        return drive

    # SATA / ATA (and anything that is not NVMe): read the attribute table.
    drive.reallocated = _ata_attr(data, 5)
    drive.pending = _ata_attr(data, 197)
    drive.offline_uncorrectable = _ata_attr(data, 198)

    sectors_written = _ata_attr(data, 241)  # Total_LBAs_Written
    if isinstance(sectors_written, (int, float)) and sectors_written > 0:
        drive.data_written_tb = round(sectors_written * _LBA_BYTES / 1e12, 2)

    life_left = _ata_norm_value(data, 177)  # Wear_Leveling_Count (Samsung): normalized = life left
    if life_left is None:
        life_left = _ata_norm_value(data, 231)  # SSD_Life_Left on some drives
    if isinstance(life_left, int):
        drive.health_pct = life_left
    return drive
|
||||||
|
|
||||||
|
|
||||||
|
def collect() -> list[DriveHealth]:
    """Health record for each device smartctl's scan lists (needs root for real data).

    Returns [] without scanning when smartctl is not installed.
    """
    records: list[DriveHealth] = []
    if shutil.which("smartctl") is not None:
        for path in _scan_devices():
            records.append(parse(path, _smartctl_json(path)))
    return records
|
||||||
|
|
||||||
|
|
||||||
|
def from_dicts(rows: list[dict]) -> list[DriveHealth]:
    """Rebuild DriveHealth objects from the privileged collector's JSON.

    Entries that are not dicts, or that lack a truthy ``device``, are skipped. Keys that are
    not DriveHealth fields are ignored. Only the fields actually present in a row are passed
    to the constructor, so a missing key falls back to the dataclass default (``model=""``,
    ``needs_root=False``, ...) rather than being forced to ``None``.
    """
    known = DriveHealth.__dataclass_fields__
    rebuilt: list[DriveHealth] = []
    for row in rows:
        if not isinstance(row, dict) or not row.get("device"):
            continue
        present = {key: row[key] for key in known if key in row}
        rebuilt.append(DriveHealth(**present))
    return rebuilt
|
||||||
|
|
||||||
|
|
||||||
|
# --- findings -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _stats_line(d: DriveHealth) -> str:
|
||||||
|
parts: list[str] = []
|
||||||
|
if d.health_pct is not None:
|
||||||
|
parts.append(f"{d.health_pct}% life left")
|
||||||
|
elif d.percent_used is not None:
|
||||||
|
parts.append(f"{d.percent_used}% used")
|
||||||
|
if d.power_on_hours is not None:
|
||||||
|
parts.append(f"{d.power_on_hours:,} h powered on")
|
||||||
|
if d.data_written_tb is not None:
|
||||||
|
parts.append(f"{d.data_written_tb:g} TB written")
|
||||||
|
if d.temp_c is not None:
|
||||||
|
parts.append(f"{d.temp_c}°C")
|
||||||
|
if d.available_spare is not None:
|
||||||
|
parts.append(f"spare {d.available_spare}%")
|
||||||
|
return " · ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def to_findings(drives: list[DriveHealth]) -> list[Finding]:
    """Turn per-drive health records into one Finding per drive.

    With no drives at all, the result is a single INFO finding when smartctl is missing and
    [] otherwise. For each drive the first matching rule wins:

    * ``needs_root``: INFO pointing at the elevated run.
    * failed SMART verdict or any non-zero failure counter: CRITICAL.
    * available spare below its threshold, or NVMe wear at 90% used or more: WARNING (worn).
    * temperature at 70°C or more: WARNING (hot).
    * anything else: OK.
    """
    if not drives:
        if shutil.which("smartctl") is not None:
            return []
        return [Finding(INFO, "Storage", "SMART not checked (smartmontools missing)",
                        "Disk self-health couldn't be read.",
                        "Install it: `sudo apt install smartmontools`")]

    results: list[Finding] = []
    for drive in drives:
        label = drive.model or drive.device
        if drive.needs_root:
            results.append(Finding(INFO, "Storage", f"{label}: SMART needs root",
                                   "Reading drive health requires elevated access.",
                                   "Run: `sudo rigdoctor report` (or launch the GUI, which asks once)."))
            continue

        summary = _stats_line(drive)

        # Failure predictors, worst first; any entry here makes the drive CRITICAL.
        counters = (
            ("reallocated sectors", drive.reallocated),
            ("pending sectors", drive.pending),
            ("offline-uncorrectable sectors", drive.offline_uncorrectable),
            ("NVMe media errors", drive.media_errors),
        )
        problems = ["SMART overall self-assessment FAILED"] if drive.passed is False else []
        problems += [f"{count} {what}" for what, count in counters
                     if isinstance(count, int) and count > 0]

        spare_low = (isinstance(drive.available_spare, int)
                     and isinstance(drive.available_spare_threshold, int)
                     and drive.available_spare < drive.available_spare_threshold)
        worn = isinstance(drive.percent_used, int) and drive.percent_used >= 90
        hot = isinstance(drive.temp_c, int) and drive.temp_c >= 70

        if problems:  # a failed verdict always adds an entry, so this covers `passed is False`
            title = f"{label}: failing ({summary})" if summary else f"{label}: failing"
            results.append(Finding(CRITICAL, "Storage", title,
                                   "; ".join(problems) + ".",
                                   "Back up this drive now and plan to replace it."))
        elif spare_low or worn:
            if spare_low:
                detail = "Available spare below the drive's threshold."
            else:
                detail = f"NVMe wear at {drive.percent_used}% used — near end of rated life."
            results.append(Finding(WARNING, "Storage", f"{label}: worn ({summary})",
                                   detail,
                                   "Back up important data and budget for a replacement."))
        elif hot:
            results.append(Finding(WARNING, "Storage", f"{label}: hot ({summary})",
                                   f"Drive temperature is {drive.temp_c}°C.",
                                   "Improve case/M.2 airflow; sustained heat shortens SSD life."))
        else:
            title = f"{label}: healthy" + (f" ({summary})" if summary else "")
            detail = "SMART self-assessment passed." if drive.passed else ""
            results.append(Finding(OK, "Storage", title, detail))
    return results
|
||||||
@@ -81,15 +81,48 @@ def available() -> bool:
|
|||||||
return bool(_proton_logs() or _steam_console())
|
return bool(_proton_logs() or _steam_console())
|
||||||
|
|
||||||
|
|
||||||
def collect(since: float | None = None, max_bytes: int = 8000) -> str:
|
def _custom_game_logs(game: str, since: float | None, max_bytes: int) -> list[str]:
|
||||||
"""Recent Proton + Steam log tails as one labelled text block ('' if none).
|
"""Tail the recent ``*.log`` files in a custom game's own log dir (e.g. SPT's
|
||||||
|
``logs/tarkov-latest.log`` + ``server-latest.log``), newest first, freshness-scoped by mtime.
|
||||||
|
|
||||||
|
Custom-game logs use their own timestamp formats, so we scope by file mtime (like the Proton
|
||||||
|
log) rather than the ``[YYYY-MM-DD …]`` line filter used for the Steam console.
|
||||||
|
"""
|
||||||
|
from . import customgames
|
||||||
|
|
||||||
|
directory = customgames.log_dir(game)
|
||||||
|
if not directory:
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
files = [p for p in Path(directory).glob("*.log") if p.is_file()]
|
||||||
|
except OSError:
|
||||||
|
return []
|
||||||
|
files.sort(key=_mtime, reverse=True)
|
||||||
|
sections: list[str] = []
|
||||||
|
for log in files[:4]: # a session touches a handful (tarkov/server/launcher latest)
|
||||||
|
if since is not None and _mtime(log) < since:
|
||||||
|
continue
|
||||||
|
tail = _tail(log, max_bytes).strip()
|
||||||
|
if tail:
|
||||||
|
sections.append(f"--- {game} log ({log.name}) ---\n{tail}")
|
||||||
|
return sections
|
||||||
|
|
||||||
|
|
||||||
|
def collect(since: float | None = None, max_bytes: int = 8000, game: str | None = None) -> str:
|
||||||
|
"""Recent Proton + Steam (+ custom-game) log tails as one labelled text block ('' if none).
|
||||||
|
|
||||||
With ``since`` (epoch), scope to that session: skip a Proton log not written during/after
|
With ``since`` (epoch), scope to that session: skip a Proton log not written during/after
|
||||||
the session (a stale per-app log from an earlier game), and keep only Steam-console lines
|
the session (a stale per-app log from an earlier game), and keep only Steam-console lines
|
||||||
timestamped at/after ``since`` — so we don't feed the model an unrelated past session.
|
timestamped at/after ``since`` — so we don't feed the model an unrelated past session.
|
||||||
|
|
||||||
|
``game`` (the diagnostic's focused title) pulls in that custom game's own logs if it has a
|
||||||
|
registered log dir — e.g. SPT's server/launcher logs, which Steam/Proton never see.
|
||||||
"""
|
"""
|
||||||
sections: list[str] = []
|
sections: list[str] = []
|
||||||
|
|
||||||
|
if game:
|
||||||
|
sections += _custom_game_logs(game, since, max_bytes)
|
||||||
|
|
||||||
protons = _proton_logs()
|
protons = _proton_logs()
|
||||||
if protons:
|
if protons:
|
||||||
log = protons[0]
|
log = protons[0]
|
||||||
|
|||||||
+159
-45
@@ -116,6 +116,31 @@ def scan_journal_text(text: str) -> list[Finding]:
|
|||||||
"Check power/thermals/driver; capture a session with `rigdoctor record`.",
|
"Check power/thermals/driver; capture a session with `rigdoctor record`.",
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# NVIDIA open-kernel-module VA-space mapping faults: a driver-internal failure that can
|
||||||
|
# storm for minutes and end in a HARD FREEZE with NO Xid logged — the GPU never "falls off
|
||||||
|
# the bus", so the Xid scan above misses it entirely. These code paths live in the open
|
||||||
|
# kernel module (nvidia-*-open); the proprietary module doesn't hit them.
|
||||||
|
nvrm_va = [
|
||||||
|
ln for ln in lines
|
||||||
|
if "gpu_vaspace.c" in ln
|
||||||
|
or "_gvaspaceMappingInsert" in ln
|
||||||
|
or "dmaAllocMapping" in ln
|
||||||
|
or "NVKMS memory for GEM object" in ln
|
||||||
|
]
|
||||||
|
if nvrm_va:
|
||||||
|
findings.append(Finding(
|
||||||
|
WARNING, "GPU", f"NVIDIA driver VA-space mapping errors ×{len(nvrm_va)}",
|
||||||
|
"The NVIDIA kernel module repeatedly failed to update the GPU's virtual address "
|
||||||
|
"space (gpu_vaspace / dmaAllocMapping assertions, NVKMS GEM-allocation failures). "
|
||||||
|
"This is a driver-internal fault that can recur for minutes and end in a hard freeze "
|
||||||
|
"with NO Xid logged — distinct from an Xid 79 hardware drop. These code paths are "
|
||||||
|
"specific to the open kernel module (nvidia-*-open).",
|
||||||
|
"If you're on the open module, switch to the proprietary NVIDIA driver "
|
||||||
|
"(install `nvidia-driver-###` instead of the `…-open` variant) and update to the "
|
||||||
|
"latest branch, then reboot. Capture a session with `rigdoctor record` to confirm "
|
||||||
|
"the errors precede the freeze.",
|
||||||
|
))
|
||||||
|
|
||||||
return findings
|
return findings
|
||||||
|
|
||||||
|
|
||||||
@@ -188,47 +213,66 @@ def check_nvidia_driver() -> list[Finding]:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def _smart_devices() -> list[str]:
|
def _read_text(path: str) -> str | None:
|
||||||
try:
|
try:
|
||||||
proc = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10)
|
return Path(path).read_text()
|
||||||
|
except OSError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _nvidia_module_is_open() -> bool | None:
|
||||||
|
"""Whether the *loaded* NVIDIA kernel module is the open-source flavor.
|
||||||
|
|
||||||
|
True = open (nvidia-*-open), False = proprietary, None = can't tell / no NVIDIA module.
|
||||||
|
/proc is authoritative for the loaded module and needs no external tool; modinfo's filename
|
||||||
|
(…/nvidia-###-open/nvidia.ko) is the fallback.
|
||||||
|
"""
|
||||||
|
proc = _read_text("/proc/driver/nvidia/version")
|
||||||
|
if proc:
|
||||||
|
low = proc.lower()
|
||||||
|
if "open kernel module" in low:
|
||||||
|
return True
|
||||||
|
if "kernel module" in low: # proprietary banner: "NVIDIA UNIX … Kernel Module …"
|
||||||
|
return False
|
||||||
|
if shutil.which("modinfo"):
|
||||||
|
try:
|
||||||
|
out = subprocess.run(["modinfo", "nvidia"], capture_output=True, text=True, timeout=10).stdout
|
||||||
except (subprocess.SubprocessError, OSError):
|
except (subprocess.SubprocessError, OSError):
|
||||||
|
out = ""
|
||||||
|
for line in out.splitlines():
|
||||||
|
if line.startswith("filename:"):
|
||||||
|
return "-open" in line
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def check_nvidia_module() -> list[Finding]:
|
||||||
|
"""Note when the open-source NVIDIA kernel module is loaded — the context behind the no-Xid
|
||||||
|
VA-space freeze signature, which lives in the open module's code paths (suggestion-only)."""
|
||||||
|
if _nvidia_module_is_open() is not True:
|
||||||
return []
|
return []
|
||||||
devices = []
|
return [Finding(
|
||||||
for line in proc.stdout.splitlines():
|
INFO, "Driver", "NVIDIA open kernel module in use",
|
||||||
line = line.strip()
|
"The loaded NVIDIA driver is the open-source kernel module (nvidia-*-open). It's fine for "
|
||||||
if line.startswith("/dev/"):
|
"most setups, but on some GeForce cards it hits driver-internal faults (VA-space mapping "
|
||||||
devices.append(line.split()[0])
|
"errors, hard freezes with no Xid) that the proprietary module doesn't.",
|
||||||
return devices
|
"If you get unexplained hard freezes with no Xid in the logs, try the proprietary NVIDIA "
|
||||||
|
"driver (`nvidia-driver-###` rather than the `…-open` variant) on the latest branch.",
|
||||||
|
)]
|
||||||
|
|
||||||
|
|
||||||
def check_smart() -> list[Finding]:
|
def check_drives() -> list[Finding]:
|
||||||
if shutil.which("smartctl") is None:
|
"""Per-drive SMART health + wear/runtime stats (see core/drives.py).
|
||||||
return [Finding(
|
|
||||||
INFO, "Storage", "SMART not checked (smartmontools missing)",
|
Uses the session's elevated collection when present (GUI launch / pkexec), else reads
|
||||||
"Disk self-health couldn't be read.",
|
smartctl directly — which only returns real data as root, so the unprivileged case yields
|
||||||
"Install it for disk health checks: `sudo apt install smartmontools`",
|
'needs root' info findings pointing at `sudo rigdoctor report`.
|
||||||
)]
|
"""
|
||||||
devices = _smart_devices()
|
from . import drives, elevation
|
||||||
if not devices:
|
|
||||||
return [Finding(
|
priv = elevation.privileged()
|
||||||
INFO, "Storage", "SMART: couldn't enumerate drives",
|
if priv is not None and priv.get("drives") is not None:
|
||||||
"Reading SMART usually needs root.",
|
return drives.to_findings(drives.from_dicts(priv["drives"]))
|
||||||
"Run: `sudo rigdoctor report`",
|
return drives.to_findings(drives.collect())
|
||||||
)]
|
|
||||||
findings: list[Finding] = []
|
|
||||||
for dev in devices:
|
|
||||||
try:
|
|
||||||
proc = subprocess.run(["smartctl", "-H", dev], capture_output=True, text=True, timeout=15)
|
|
||||||
except (subprocess.SubprocessError, OSError):
|
|
||||||
continue
|
|
||||||
combined = proc.stdout + proc.stderr
|
|
||||||
if "Permission denied" in combined or "requires root" in combined.lower():
|
|
||||||
findings.append(Finding(INFO, "Storage", f"SMART for {dev} needs root", "", "Run: `sudo rigdoctor report`"))
|
|
||||||
elif "PASSED" in combined:
|
|
||||||
findings.append(Finding(OK, "Storage", f"SMART OK: {dev}", "Overall-health self-assessment passed."))
|
|
||||||
elif "FAILED" in combined or "FAILING_NOW" in combined:
|
|
||||||
findings.append(Finding(CRITICAL, "Storage", f"SMART FAILED: {dev}", "The drive reports failing health.", "Back up now and replace the drive."))
|
|
||||||
return findings
|
|
||||||
|
|
||||||
|
|
||||||
def check_live_temps() -> list[Finding]:
|
def check_live_temps() -> list[Finding]:
|
||||||
@@ -251,27 +295,97 @@ def check_live_temps() -> list[Finding]:
|
|||||||
)]
|
)]
|
||||||
|
|
||||||
|
|
||||||
|
def check_pcie_links() -> list[Finding]:
    """Report NVMe controllers whose negotiated PCIe link is below what the drive supports.

    A width that differs from the maximum is reported as a WARNING (typically motherboard
    lane-sharing). Otherwise a lower generation is reported as INFO, since that can also be
    normal link power management at idle. Controllers whose current or maximum generation
    cannot be read are skipped. The GPU is intentionally not checked here: NVIDIA drops its
    PCIe gen *and* width at idle, so a point-in-time snapshot is misleading.
    """
    from . import inventory

    results: list[Finding] = []
    for ctrl, pci_dir in inventory.nvme_controllers():
        gen_now, lanes_now, gen_max, lanes_max = inventory.read_link(pci_dir)
        if not (gen_now and gen_max):
            continue

        if lanes_max and lanes_now and lanes_now != lanes_max:
            # Fewer lanes → almost always lane-sharing.
            title = f"{ctrl} linked at x{lanes_now} (supports x{lanes_max})"
            detail = (
                f"{ctrl} negotiated PCIe Gen{gen_now} x{lanes_now}, but the drive supports "
                f"Gen{gen_max} x{lanes_max}. Fewer lanes is usually motherboard lane-sharing — a GPU or a "
                "second card in a PCIe slot, or another populated M.2, can steal lanes from this slot."
            )
            advice = ("Check your board manual's lane-sharing table; move the drive to a full-x4 "
                      "(often CPU-attached) M.2 slot.")
            results.append(Finding(WARNING, "PCIe", title, detail, advice))
        elif gen_now < gen_max:
            # Full width but a lower generation → slower slot or idle ASPM.
            title = f"{ctrl} linked at Gen{gen_now} (supports Gen{gen_max})"
            detail = (
                f"{ctrl} negotiated PCIe Gen{gen_now} but supports Gen{gen_max}. This can be a slower "
                "(chipset or older) M.2 slot, or normal link power management (ASPM) at idle."
            )
            advice = "If you expect full speed, check the slot and the BIOS PCIe/ASPM settings."
            results.append(Finding(INFO, "PCIe", title, detail, advice))
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def check_displays() -> list[Finding]:
    """Emit an INFO finding for each monitor that could run at a higher refresh rate.

    Example: a 165 Hz panel left at 60 Hz, a common and easily-missed gaming setting. The
    decision is delegated to each monitor's ``can_go_faster`` flag. This check only suggests
    and changes nothing.
    """
    from . import displays

    results: list[Finding] = []
    for mon in displays.collect():
        if not mon.can_go_faster:
            continue
        current_hz = round(mon.refresh)
        best_hz = round(mon.max_refresh)
        title = f"{mon.connector} at {current_hz} Hz (supports {best_hz} Hz)"
        detail = (
            f"{mon.name or mon.connector} is running at {current_hz} Hz at "
            f"{mon.width}x{mon.height}, but supports {best_hz} Hz at that resolution."
        )
        advice = "Raise the refresh rate in your desktop's Display settings (GNOME: Settings → Displays)."
        results.append(Finding(INFO, "Display", title, detail, advice))
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def check_memory_speed() -> list[Finding]:
    """Report RAM running below its rated speed, i.e. an XMP (Intel) / EXPO (AMD) profile left off.

    Uses the dmidecode data from the session's elevated collection when present, otherwise
    ``inventory._dmidecode()``. Only the module with the largest rated-minus-configured gap
    is reported (the first one wins on ties). Returns [] when no module is below its rating.
    """
    from . import elevation, inventory

    priv = elevation.privileged()
    if priv and priv.get("dmidecode"):
        dmi = priv["dmidecode"]
    else:
        dmi = inventory._dmidecode()

    # (configured, rated) for every module running below its rated speed.
    shortfalls = [
        (running, rating)
        for running, rating in map(inventory.module_speed, dmi.get("memory", []))
        if running and rating and running < rating
    ]
    if not shortfalls:
        return []

    configured, rated = max(shortfalls, key=lambda pair: pair[1] - pair[0])
    return [Finding(
        INFO, "Memory", f"RAM at {configured} MT/s (rated {rated} MT/s)",
        f"Memory is running at {configured} MT/s but the modules are rated {rated} MT/s — the "
        "XMP/EXPO profile isn't enabled, so you're leaving memory bandwidth on the table.",
        "Enable XMP (Intel) or EXPO (AMD) in your BIOS/UEFI to run at the rated speed.")]
|
||||||
|
|
||||||
|
|
||||||
def run_health_checks(include_journal: bool = True) -> list[Finding]:
|
def run_health_checks(include_journal: bool = True) -> list[Finding]:
|
||||||
"""Run all checks and return findings sorted by severity (worst first).
|
"""Run all checks and return findings sorted by severity (worst first).
|
||||||
|
|
||||||
SMART needs root; if the session collected it via launch elevation, use that
|
Drive SMART and RAM speed need root; if the session collected them via launch elevation,
|
||||||
instead of re-running smartctl (which would just report "needs root").
|
those checks use the cached data instead of re-running (which would just report "needs root").
|
||||||
|
|
||||||
`include_journal=False` skips the 7-day kernel-journal scan — used by the crash
|
`include_journal=False` skips the 7-day kernel-journal scan — used by the crash
|
||||||
analysis, which scans the previous (crashed) boot specifically instead.
|
analysis, which scans the previous (crashed) boot specifically instead.
|
||||||
"""
|
"""
|
||||||
from . import elevation
|
|
||||||
|
|
||||||
findings: list[Finding] = []
|
findings: list[Finding] = []
|
||||||
findings += check_nvidia_driver()
|
findings += check_nvidia_driver()
|
||||||
|
findings += check_nvidia_module()
|
||||||
if include_journal:
|
if include_journal:
|
||||||
findings += check_journal()
|
findings += check_journal()
|
||||||
findings += check_journal_persistence()
|
findings += check_journal_persistence()
|
||||||
priv = elevation.privileged()
|
findings += check_drives()
|
||||||
if priv is not None and priv.get("smart") is not None:
|
|
||||||
findings += [Finding(**d) for d in priv["smart"]]
|
|
||||||
else:
|
|
||||||
findings += check_smart()
|
|
||||||
findings += check_live_temps()
|
findings += check_live_temps()
|
||||||
|
findings += check_pcie_links()
|
||||||
|
findings += check_displays()
|
||||||
|
findings += check_memory_speed() # uses elevation data if present, else dmidecode (root)
|
||||||
findings.sort(key=lambda f: _ORDER.get(f.severity, 9))
|
findings.sort(key=lambda f: _ORDER.get(f.severity, 9))
|
||||||
return findings
|
return findings
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from __future__ import annotations
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@@ -85,6 +86,35 @@ def _firmware(dmi: dict) -> Section:
|
|||||||
return Section("Firmware", items)
|
return Section("Firmware", items)
|
||||||
|
|
||||||
|
|
||||||
|
# Common DDR5 XMP/EXPO speed grades (MT/s) — used to read a kit's rated speed from its part
|
||||||
|
# number, since with XMP/EXPO off dmidecode only reports the JEDEC base (e.g. 4800).
|
||||||
|
_DDR_SPEEDS = {4800, 5200, 5600, 6000, 6200, 6400, 6600, 6800, 7000, 7200, 7600, 8000, 8200, 8400}
|
||||||
|
|
||||||
|
|
||||||
|
def _mts(value: str) -> int | None:
|
||||||
|
"""Parse a dmidecode speed like '4800 MT/s' (or 'MHz') to its integer MT/s."""
|
||||||
|
m = re.match(r"\s*(\d+)", value or "")
|
||||||
|
return int(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _rated_from_part(part: str) -> int | None:
    """Highest known DDR speed grade found as a standalone 4-digit number in a part number.

    Only digit runs of exactly four digits (not adjacent to another digit) are considered,
    and only those listed in ``_DDR_SPEEDS``. Returns None when none match or *part* is
    empty/None.
    """
    best: int | None = None
    for token in re.findall(r"(?<!\d)(\d{4})(?!\d)", part or ""):
        grade = int(token)
        if grade in _DDR_SPEEDS and (best is None or grade > best):
            best = grade
    return best
|
||||||
|
|
||||||
|
|
||||||
|
def module_speed(m: dict) -> tuple[int | None, int | None]:
    """(configured, rated) MT/s for one dmidecode Memory Device entry.

    Configured comes from the first non-empty of 'Configured Memory Speed', 'Configured Clock
    Speed' and 'Speed'. Rated is the larger of the 'Speed' field and the speed grade embedded
    in 'Part Number', so an unapplied XMP/EXPO profile is still detected. Either element is
    None when it cannot be determined.
    """
    running_text = (m.get("Configured Memory Speed")
                    or m.get("Configured Clock Speed")
                    or m.get("Speed", ""))
    configured = _mts(running_text)

    reported_max = _mts(m.get("Speed", ""))
    from_part = _rated_from_part(m.get("Part Number", ""))
    rated = max((s for s in (reported_max, from_part) if s), default=None)
    return configured, rated
|
||||||
|
|
||||||
|
|
||||||
def _memory(dmi: dict) -> Section:
|
def _memory(dmi: dict) -> Section:
|
||||||
items: list[tuple[str, str]] = []
|
items: list[tuple[str, str]] = []
|
||||||
try:
|
try:
|
||||||
@@ -98,8 +128,12 @@ def _memory(dmi: dict) -> Section:
|
|||||||
if modules:
|
if modules:
|
||||||
items.append(("Modules", str(len(modules))))
|
items.append(("Modules", str(len(modules))))
|
||||||
for i, m in enumerate(modules):
|
for i, m in enumerate(modules):
|
||||||
desc = " · ".join(p for p in (m.get("Size"), m.get("Type"), m.get("Speed"), m.get("Part Number")) if p)
|
configured, rated = module_speed(m)
|
||||||
items.append((f"Slot {i}", desc))
|
speed = f"{configured} MT/s" if configured else m.get("Speed", "")
|
||||||
|
if rated and configured and rated > configured: # XMP/EXPO not applied
|
||||||
|
speed += f" (rated {rated})"
|
||||||
|
parts = (m.get("Size"), m.get("Type"), speed, m.get("Part Number"))
|
||||||
|
items.append((f"Slot {i}", " · ".join(p for p in parts if p)))
|
||||||
elif shutil.which("dmidecode"):
|
elif shutil.which("dmidecode"):
|
||||||
items.append(("Modules", "run with admin for module details"))
|
items.append(("Modules", "run with admin for module details"))
|
||||||
return Section("Memory", items)
|
return Section("Memory", items)
|
||||||
@@ -123,6 +157,64 @@ def _gpu() -> Section:
|
|||||||
return Section("GPU", [("Device", g) for g in gpus] or [("Device", "unknown")])
|
return Section("GPU", [("Device", g) for g in gpus] or [("Device", "unknown")])
|
||||||
|
|
||||||
|
|
||||||
|
# PCIe link speed (GT/s) → generation.
|
||||||
|
_PCIE_GEN = {"2.5": 1, "5": 2, "5.0": 2, "8": 3, "8.0": 3, "16": 4, "16.0": 4, "32": 5, "32.0": 5}
|
||||||
|
|
||||||
|
|
||||||
|
def _gen(speed: str) -> int | None:
|
||||||
|
"""Map a sysfs link speed like '16.0 GT/s PCIe' to its PCIe generation (4)."""
|
||||||
|
tok = speed.strip().split()[0] if speed.strip() else ""
|
||||||
|
return _PCIE_GEN.get(tok)
|
||||||
|
|
||||||
|
|
||||||
|
def read_link(dev: Path) -> tuple[int | None, str, int | None, str]:
    """Negotiated and maximum PCIe link of a PCI device dir: (cur_gen, cur_width, max_gen, max_width).

    Generations are ints (e.g. 4), or None when the speed file is unreadable or its value is
    unrecognised. Widths are the raw sysfs strings (e.g. '4'), or '' when unreadable.
    """
    def attr(filename: str) -> str:
        # A missing or unreadable sysfs attribute reads as "".
        try:
            text = (dev / filename).read_text()
        except OSError:
            return ""
        return text.strip()

    cur_gen = _gen(attr("current_link_speed"))
    cur_width = attr("current_link_width")
    max_gen = _gen(attr("max_link_speed"))
    max_width = attr("max_link_width")
    return cur_gen, cur_width, max_gen, max_width
|
||||||
|
|
||||||
|
|
||||||
|
def _link_desc(dev: Path) -> str:
    """Describe a PCI device's negotiated PCIe link, noting if it's below its max.

    e.g. 'PCIe Gen4 x4', or 'PCIe Gen3 x4 (capable of Gen4 x4)' when downtrained / in a
    slower slot. Returns '' when the current generation or width is unavailable.
    """
    cur_g, cur_w, max_g, max_w = read_link(dev)
    if not cur_g or not cur_w:
        return ""
    desc = f"PCIe Gen{cur_g} x{cur_w}"
    # Widths are compared as strings, exactly as read_link returns them.
    if max_g and (cur_g < max_g or (max_w and cur_w != max_w)):
        # Leave the width out when it couldn't be read, rather than printing a bare "x".
        capable = f"Gen{max_g} x{max_w}" if max_w else f"Gen{max_g}"
        desc += f" (capable of {capable})"
    return desc
|
||||||
|
|
||||||
|
|
||||||
|
def nvme_controllers(base: Path = Path("/sys/class/nvme")) -> list[tuple[str, Path]]:
    """Each NVMe controller as (name, pci-device-dir), e.g. ('nvme0', /sys/.../device).

    ``base`` is the directory to scan. Only entries named ``nvme<digits>`` are returned,
    ordered by controller number so that nvme2 comes before nvme10. Returns [] when the
    directory cannot be listed.
    """
    try:
        names = [p.name for p in base.iterdir() if re.fullmatch(r"nvme\d+", p.name)]
    except OSError:
        return []
    # Sort on the numeric suffix; a plain string sort would put "nvme10" before "nvme2".
    names.sort(key=lambda name: int(name[len("nvme"):]))
    return [(name, base / name / "device") for name in names]
|
||||||
|
|
||||||
|
|
||||||
|
def _nvme_link(block_name: str) -> str:
|
||||||
|
"""PCIe link for an NVMe block device (nvme0n1 → controller nvme0); '' for non-NVMe."""
|
||||||
|
m = re.match(r"(nvme\d+)", block_name)
|
||||||
|
if not m:
|
||||||
|
return ""
|
||||||
|
return _link_desc(Path("/sys/class/nvme") / m.group(1) / "device")
|
||||||
|
|
||||||
|
|
||||||
def _storage() -> Section:
|
def _storage() -> Section:
|
||||||
items: list[tuple[str, str]] = []
|
items: list[tuple[str, str]] = []
|
||||||
# TYPE first so MODEL (which can contain spaces) is the trailing field.
|
# TYPE first so MODEL (which can contain spaces) is the trailing field.
|
||||||
@@ -133,15 +225,27 @@ def _storage() -> Section:
|
|||||||
continue
|
continue
|
||||||
name, size = parts[1], parts[2]
|
name, size = parts[1], parts[2]
|
||||||
model = parts[3] if len(parts) > 3 else ""
|
model = parts[3] if len(parts) > 3 else ""
|
||||||
items.append((name, f"{model} ({size})".strip()))
|
desc = f"{model} ({size})".strip()
|
||||||
|
link = _nvme_link(name) # NVMe PCIe gen/width (e.g. Gen4 x4), flags downtrains
|
||||||
|
if link:
|
||||||
|
desc += f" · {link}"
|
||||||
|
items.append((name, desc))
|
||||||
return Section("Storage", items or [("Disks", "unknown")])
|
return Section("Storage", items or [("Disks", "unknown")])
|
||||||
|
|
||||||
|
|
||||||
def _display() -> Section:
    """Build the "Display" section: session type, desktop, then one row per monitor.

    Session/desktop come from the XDG environment variables; monitor rows come from
    ``displays.collect()`` and note a higher supported refresh rate when the monitor
    reports ``can_go_faster``.
    """
    from . import displays

    desktop = os.environ.get("XDG_CURRENT_DESKTOP") or os.environ.get("DESKTOP_SESSION", "unknown")
    rows = [
        ("Session", os.environ.get("XDG_SESSION_TYPE", "unknown")),
        ("Desktop", desktop),
    ]
    for monitor in displays.collect():
        mode = f"{monitor.width}x{monitor.height} @ {round(monitor.refresh)} Hz"
        if monitor.can_go_faster:
            mode += f" (supports {round(monitor.max_refresh)} Hz)"
        rows.append((monitor.label(), mode))
    return Section("Display", rows)
|
||||||
|
|
||||||
|
|
||||||
def _dmidecode() -> dict:
|
def _dmidecode() -> dict:
|
||||||
|
|||||||
@@ -0,0 +1,314 @@
|
|||||||
|
"""Parse a Windows crash dump (``.dmp`` minidump) into text the AI can reason over (M14).
|
||||||
|
|
||||||
|
Linux gamers get these from Windows games running under **Proton/Wine**: the game's
|
||||||
|
crash handler (Crashpad/Breakpad, Unreal/Unity, or Wine itself) writes a binary minidump
|
||||||
|
when the title hard-crashes. The file is binary, so we can't hand it to a model directly —
|
||||||
|
we parse the documented ``MDMP`` streams with stdlib :mod:`struct` (no pip deps, per the
|
||||||
|
core rule) and pull out the parts that actually diagnose a crash:
|
||||||
|
|
||||||
|
* the **exception / crash reason** (e.g. access violation 0xC0000005),
|
||||||
|
* the **faulting module** (which DLL the crash address lands in — ``nvwgf2umx.dll``,
|
||||||
|
``d3d11.dll``, an anticheat, the game's own .exe…),
|
||||||
|
* **OS / CPU** info, and the **loaded module list**.
|
||||||
|
|
||||||
|
If ``minidump_stackwalk`` (Breakpad) or ``minidump-stackwalk`` (rust-minidump) is on PATH,
|
||||||
|
its fuller report is appended best-effort; we never depend on it.
|
||||||
|
|
||||||
|
The result feeds the existing opt-in AI flow (:mod:`ai`) exactly like the sensor findings do.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import struct
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .health import CRITICAL, INFO, Finding
|
||||||
|
|
||||||
|
# --- MDMP on-disk layout (all little-endian, packed) --------------------------------
|
||||||
|
_SIGNATURE = b"MDMP"
|
||||||
|
_HEADER = struct.Struct("<4sIIIIIQ") # sig, ver, n_streams, dir_rva, csum, time, flags
|
||||||
|
_DIRECTORY = struct.Struct("<III") # stream_type, data_size, data_rva
|
||||||
|
_SYSINFO = struct.Struct("<HHHBBIIIII") # arch, lvl, rev, n_cpu, prod, maj, min, build, plat, csd
|
||||||
|
_MODULE_STRIDE = 108 # sizeof(MINIDUMP_MODULE)
|
||||||
|
|
||||||
|
# Stream types we read (MINIDUMP_STREAM_TYPE).
|
||||||
|
_MODULE_LIST = 4
|
||||||
|
_EXCEPTION = 6
|
||||||
|
_SYSTEM_INFO = 7
|
||||||
|
_COMMENT_A = 10
|
||||||
|
_COMMENT_W = 11
|
||||||
|
|
||||||
|
_ARCH = {0: "x86", 5: "ARM", 6: "IA-64", 9: "x86-64", 12: "ARM64", 0xFFFF: "unknown"}
|
||||||
|
_PLATFORM = {0x8201: "Linux", 0x8202: "Solaris", 0x8203: "macOS", 0x8204: "iOS",
|
||||||
|
0x8205: "Android", 0x8207: "NaCl"}
|
||||||
|
|
||||||
|
# Common Windows exception (NTSTATUS) codes — what the model needs named, not raw hex.
|
||||||
|
_EXCEPTION_NAMES = {
|
||||||
|
0x80000003: "Breakpoint",
|
||||||
|
0x80000004: "Single step",
|
||||||
|
0xC0000005: "Access violation",
|
||||||
|
0xC0000006: "In-page error",
|
||||||
|
0xC000001D: "Illegal instruction",
|
||||||
|
0xC0000025: "Noncontinuable exception",
|
||||||
|
0xC000008C: "Array bounds exceeded",
|
||||||
|
0xC000008E: "Float divide by zero",
|
||||||
|
0xC0000090: "Float invalid operation",
|
||||||
|
0xC0000094: "Integer divide by zero",
|
||||||
|
0xC0000095: "Integer overflow",
|
||||||
|
0xC0000096: "Privileged instruction",
|
||||||
|
0xC00000FD: "Stack overflow",
|
||||||
|
0xC0000135: "DLL not found",
|
||||||
|
0xC0000142: "DLL initialization failed",
|
||||||
|
0xC0000374: "Heap corruption",
|
||||||
|
0xC0000409: "Stack buffer overrun / fast fail",
|
||||||
|
0xC000041D: "Fatal user-callback exception",
|
||||||
|
0xE06D7363: "C++ exception (MSVC)",
|
||||||
|
}
|
||||||
|
_ACCESS = {0: "reading", 1: "writing", 8: "executing"} # AV ExceptionInformation[0]
|
||||||
|
|
||||||
|
_STACKWALK_BINS = ("minidump_stackwalk", "minidump-stackwalk")
|
||||||
|
_MODULES_SHOWN = 80 # cap the module list so the AI prompt stays bounded
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Module:
    """One entry of the dump's loaded-module list."""

    # File name of the module, without any directory part.
    name: str
    # Address the image is loaded at.
    base: int
    # Size of the image; with ``base`` it bounds the module's address range.
    size: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MinidumpReport:
    """Everything extracted from one minidump; ``ok`` is False until parsing succeeds."""

    # Path of the dump file, as a string.
    path: str
    # Outcome: ``ok`` flips to True on success, otherwise ``error`` explains why not.
    ok: bool = False
    error: str = ""
    # Exception details.
    crash_reason: str = ""
    exception_code: int | None = None
    exception_address: int | None = None
    faulting_module: str | None = None
    crashing_thread: int | None = None
    # Environment the dump was captured in.
    os_name: str = ""
    cpu_arch: str = ""
    cpu_count: int = 0
    timestamp: int | None = None
    # Loaded modules, the dump's comment stream and optional stackwalker output.
    modules: list[Module] = field(default_factory=list)
    comment: str = ""
    stackwalk: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
def parse(path, *, run_stackwalk: bool = True) -> MinidumpReport:
    """Parse a ``.dmp`` file into a :class:`MinidumpReport`.

    An unreadable, non-MDMP or structurally broken file comes back with ``ok=False`` and
    ``error`` filled in rather than raising. On success the faulting module is resolved
    from the exception address and, when ``run_stackwalk`` is true, the external
    stackwalker's output is attached.
    """
    result = MinidumpReport(path=str(path))
    try:
        blob = Path(path).read_bytes()
    except OSError as exc:
        result.error = f"can't read the file: {exc}"
        return result

    looks_like_mdmp = len(blob) >= _HEADER.size and blob[:4] == _SIGNATURE
    if not looks_like_mdmp:
        result.error = "not a Windows minidump (missing the 'MDMP' signature)."
        return result

    try:
        header = _HEADER.unpack_from(blob, 0)
        stream_count, directory_rva, captured = header[2], header[3], header[5]
        result.timestamp = captured or None
        directory = _streams(blob, directory_rva, stream_count)
        _read_system_info(blob, directory.get(_SYSTEM_INFO), result)
        result.modules = _read_modules(blob, directory.get(_MODULE_LIST))
        _read_exception(blob, directory.get(_EXCEPTION), result)
        result.comment = _read_comment(blob, directory)
    except (struct.error, ValueError, IndexError) as exc:
        result.error = f"the minidump looks corrupt or unsupported: {exc}"
        return result

    if result.exception_address is not None:
        result.faulting_module = _module_at(result.modules, result.exception_address)
    result.ok = True
    if run_stackwalk:
        result.stackwalk = stackwalk(path)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _streams(data: bytes, dir_rva: int, n: int) -> dict[int, tuple[int, int]]:
    """Index the stream directory as stream_type -> (data_size, data_rva).

    Reads up to ``n`` entries starting at ``dir_rva`` and stops at the first entry that
    would run past the end of ``data``. When a type repeats, the first entry is kept.
    """
    entry_size = _DIRECTORY.size
    last_valid = len(data) - entry_size
    table: dict[int, tuple[int, int]] = {}
    for index in range(n):
        position = dir_rva + index * entry_size
        if position > last_valid:
            break
        kind, length, where = _DIRECTORY.unpack_from(data, position)
        if kind not in table:
            table[kind] = (length, where)
    return table
|
||||||
|
|
||||||
|
|
||||||
|
def _read_system_info(data: bytes, loc, report: MinidumpReport) -> None:
    """Fill ``report``'s CPU architecture, CPU count and OS name from the system-info stream.

    ``loc`` is the stream's ``(size, rva)`` pair, or a falsy value when the dump has no
    such stream, in which case ``report`` is left untouched.
    """
    if not loc:
        return
    fields = _SYSINFO.unpack_from(data, loc[1])
    arch, cpu_total = fields[0], fields[3]
    major, minor, build, platform = fields[5:9]

    report.cpu_arch = _ARCH.get(arch, f"arch 0x{arch:x}")
    report.cpu_count = cpu_total

    version = f"{major}.{minor}.{build}"
    if platform == 2:  # VER_PLATFORM_WIN32_NT
        report.os_name = f"Windows {version}"
    elif platform in _PLATFORM:
        # An all-zero version is dropped rather than shown as "0.0.0".
        suffix = f" {version}" if (major or minor or build) else ""
        report.os_name = _PLATFORM[platform] + suffix
    else:
        report.os_name = f"platform 0x{platform:x} {version}"
|
||||||
|
|
||||||
|
|
||||||
|
def _read_modules(data: bytes, loc) -> list[Module]:
    """Decode the module-list stream into :class:`Module` records.

    ``loc`` is the stream's ``(size, rva)`` pair or a falsy value (→ ``[]``). The stream
    is a u32 count followed by fixed-size records; decoding stops at the first record
    that would extend past the end of ``data``.
    """
    if not loc:
        return []
    rva = loc[1]
    (count,) = struct.unpack_from("<I", data, rva)
    first = rva + 4
    found: list[Module] = []
    for index in range(count):
        record = first + index * _MODULE_STRIDE
        if record + _MODULE_STRIDE > len(data):
            break
        # u64 image base @0, u32 image size @8, (8 bytes skipped), u32 name RVA @20.
        image_base, image_size, name_rva = struct.unpack_from("<QI8xI", data, record)
        found.append(Module(_read_mdstring(data, name_rva), image_base, image_size))
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def _read_exception(data: bytes, loc, report: MinidumpReport) -> None:
    """Copy the exception stream's thread id, code, address and reason into ``report``.

    ``loc`` is the stream's ``(size, rva)`` pair; a falsy value leaves ``report`` as is.
    Offsets are relative to the start of the MINIDUMP_EXCEPTION_STREAM.
    """
    if not loc:
        return
    rva = loc[1]

    def u32(offset: int) -> int:
        return struct.unpack_from("<I", data, rva + offset)[0]

    thread = u32(0)                                          # ThreadId
    code = u32(8)                                            # ExceptionRecord.ExceptionCode
    address = struct.unpack_from("<Q", data, rva + 24)[0]    # ExceptionRecord.ExceptionAddress
    param_count = u32(32)

    report.crashing_thread = thread
    report.exception_code = code
    report.exception_address = address
    report.crash_reason = _describe_exception(data, rva, code, param_count)
|
||||||
|
|
||||||
|
|
||||||
|
def _describe_exception(data: bytes, rva: int, code: int, n_params: int) -> str:
    """Render an exception code as 'Name (0xCODE)', plus access details where available.

    For codes 0xC0000005 and 0xC0000006 with at least two parameters, the access kind and
    target address are appended, read from ExceptionInformation[0] and [1] of the
    exception stream at ``rva``.
    """
    label = _EXCEPTION_NAMES.get(code, "Unknown exception")
    text = f"{label} (0x{code:08X})"
    if n_params < 2 or code not in (0xC0000005, 0xC0000006):
        return text
    (access_kind,) = struct.unpack_from("<Q", data, rva + 40)  # ExceptionInformation[0]
    (target,) = struct.unpack_from("<Q", data, rva + 48)       # ExceptionInformation[1]
    return text + f" {_ACCESS.get(access_kind, 'accessing')} 0x{target:X}"
|
||||||
|
|
||||||
|
|
||||||
|
def _read_mdstring(data: bytes, rva: int) -> str:
|
||||||
|
"""A MINIDUMP_STRING (u32 byte-length + UTF-16LE), returned as a basename."""
|
||||||
|
if not rva or rva + 4 > len(data):
|
||||||
|
return ""
|
||||||
|
length, = struct.unpack_from("<I", data, rva)
|
||||||
|
start = rva + 4
|
||||||
|
raw = data[start:start + length]
|
||||||
|
text = raw.decode("utf-16-le", "replace").strip("\x00")
|
||||||
|
return text.replace("\\", "/").rsplit("/", 1)[-1] or text
|
||||||
|
|
||||||
|
|
||||||
|
def _read_comment(data: bytes, streams: dict[int, tuple[int, int]]) -> str:
    """Return the dump's comment text, or '' when there is no comment stream.

    The wide (UTF-16LE) comment stream is preferred over the narrow one, which is
    decoded as UTF-8. Undecodable bytes are replaced; NULs and surrounding whitespace
    are stripped.
    """
    for stream_type, codec in ((_COMMENT_W, "utf-16-le"), (_COMMENT_A, "utf-8")):
        if stream_type in streams:
            size, rva = streams[stream_type]
            return data[rva:rva + size].decode(codec, "replace").strip("\x00").strip()
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _module_at(modules: list[Module], address: int) -> str | None:
|
||||||
|
for m in modules:
|
||||||
|
if m.base <= address < m.base + m.size:
|
||||||
|
return m.name
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def stackwalk(path, timeout: float = 25.0, max_chars: int = 12000) -> str:
    """Best-effort fuller report from an external stackwalker, or '' if none is installed.

    Runs the first of ``_STACKWALK_BINS`` found on PATH against ``path`` and returns its
    stripped stdout, truncated to ``max_chars``. A failure to launch or a run longer than
    ``timeout`` seconds yields ''. The exit status is ignored.
    """
    exe = next(filter(None, map(shutil.which, _STACKWALK_BINS)), None)
    if not exe:
        return ""
    try:
        proc = subprocess.run(
            [exe, str(path)],
            capture_output=True,
            text=True,
            # Strict decoding would raise UnicodeDecodeError on stray bytes in the tool's
            # output, which the except clause below does not cover.
            errors="replace",
            timeout=timeout,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        return ""
    return (proc.stdout or "").strip()[:max_chars]
|
||||||
|
|
||||||
|
|
||||||
|
# --- rendering ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
def to_text(report: MinidumpReport) -> str:
    """Render the report as a plain-text summary (also shown in the GUI).

    One line per available fact, then the module list capped at ``_MODULES_SHOWN``
    entries, then the first 1000 characters of the dump comment. Empty fields are
    omitted.
    """
    out = [f"Crash dump: {Path(report.path).name}"]

    if report.crash_reason:
        out.append(f"Crash reason: {report.crash_reason}")

    if report.faulting_module:
        out.append(f"Faulting module: {report.faulting_module}")
    elif report.exception_address is not None:
        out.append(f"Faulting address: 0x{report.exception_address:X} (no module matched)")

    if report.crashing_thread is not None:
        out.append(f"Crashing thread: {report.crashing_thread}")
    if report.os_name:
        out.append(f"OS: {report.os_name}")
    if report.cpu_arch:
        logical = f" ({report.cpu_count} logical)" if report.cpu_count else ""
        out.append(f"CPU: {report.cpu_arch}{logical}")
    if report.timestamp:
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(report.timestamp))
        out.append("Captured: " + stamp)

    if report.modules:
        total = len(report.modules)
        listed = report.modules[:_MODULES_SHOWN]
        out.append(f"\nLoaded modules ({total}):")
        # Modules without a name are skipped in the listing.
        out.extend(f"- {mod.name}" for mod in listed if mod.name)
        hidden = total - len(listed)
        if hidden > 0:
            out.append(f"- (+{hidden} more)")

    if report.comment:
        out.append(f"\nDump comment:\n{report.comment[:1000]}")
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def to_ai_text(report: MinidumpReport) -> str:
    """Build the text block sent to the model.

    Layout: a Proton/Linux framing paragraph, a blank line, the ``to_text`` summary and,
    when present, the external stackwalker output under its own heading.
    """
    framing = (
        "These findings come from a Windows crash minidump (.dmp) produced by a game running "
        "under Proton/Wine on Linux. The faulting modules are Windows DLLs inside the Proton "
        "prefix, so the crash is a Windows-process fault but the fixes are Linux/Proton-side "
        "(Proton version, DXVK/VKD3D, GPU driver, launch options, shader cache) — never Windows "
        "admin/registry steps."
    )
    body = "\n".join((framing, "", to_text(report)))
    if not report.stackwalk:
        return body
    return body + "\n" + "\nminidump_stackwalk output:\n" + report.stackwalk
|
||||||
|
|
||||||
|
|
||||||
|
def to_findings(report: MinidumpReport) -> list[Finding]:
    """Turn the report into Finding cards for the GUI.

    Always emits one CRITICAL card describing the crash; adds an INFO "Dump environment"
    card when the OS name or CPU architecture is known.
    """
    where = []
    if report.faulting_module:
        where.append(f"in {report.faulting_module}")
    if report.exception_address is not None:
        where.append(f"at 0x{report.exception_address:X}")

    detail = report.crash_reason or "Crash recorded"
    if where:
        detail += " " + " ".join(where) + "."

    if report.faulting_module:
        title = f"Crash in {report.faulting_module}"
    else:
        title = "Crash recorded"

    cards: list[Finding] = [
        Finding(
            CRITICAL, "Crash dump", title, detail,
            "Use “Explain with AI” for likely causes and Proton-side fixes.",
        )
    ]

    environment = []
    if report.os_name:
        environment.append(report.os_name)
    if report.cpu_arch:
        environment.append(f"{report.cpu_arch} CPU")
    if environment:
        cards.append(Finding(INFO, "Crash dump", "Dump environment", " · ".join(environment)))
    return cards
|
||||||
@@ -0,0 +1,322 @@
|
|||||||
|
"""GPU stress + close thermal monitoring — the repro tool for load-correlated crashes.
|
||||||
|
|
||||||
|
Run a GPU load and sample sensors at a high rate, then report peak/sustained temperatures,
|
||||||
|
how long the GPU spent above each temperature threshold, power headroom vs the limit, whether
|
||||||
|
it throttled, and any GPU fault (Xid / VA-space / a query timeout) that hit during the window.
|
||||||
|
This is the on-demand way to reproduce the "only under load / only certain games" freezes
|
||||||
|
instead of waiting for a game to trigger them.
|
||||||
|
|
||||||
|
The load comes from, in order: an explicit ``command`` (your game, or a loader like gpu-burn),
|
||||||
|
an auto-detected loader on PATH (gpu-burn / vkmark / glmark2 / vkcube), or **monitor-only** when
|
||||||
|
none is found — then you generate the load yourself (launch the game) while this closely tracks
|
||||||
|
temps for the duration.
|
||||||
|
|
||||||
|
Stdlib only. Degrades gracefully: no nvidia-smi → no GPU stats; a loader that won't start →
|
||||||
|
monitor-only with a note; missing journal access → no fault scan, just the telemetry.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from collections.abc import Callable
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from . import health
|
||||||
|
from .sample import Sample
|
||||||
|
from .sampler import Sampler
|
||||||
|
from .sources import available_sources
|
||||||
|
|
||||||
|
# Default temperature dwell thresholds (°C). 83 is Ampere's typical thermal-throttle point;
|
||||||
|
# 90+ is hot; sustained 95+ on the core (or 100+ on GDDR6 memory) is a cooling problem.
|
||||||
|
DEFAULT_THRESHOLDS = (80, 85, 90, 95)
|
||||||
|
|
||||||
|
# Known GPU load generators, best (heaviest / most deterministic) first. argv builder takes the
|
||||||
|
# remaining duration so a self-terminating loader (gpu-burn) bounds itself; the windowed
|
||||||
|
# benchmarks loop until we kill them. None are required — detection is best-effort.
|
||||||
|
_LOADERS: list[tuple[str, Callable[[float], list[str]]]] = [
|
||||||
|
("gpu-burn", lambda secs: ["gpu-burn", str(max(1, int(secs)))]),
|
||||||
|
("vkmark", lambda _s: ["vkmark", "--run-forever"]),
|
||||||
|
("glmark2", lambda _s: ["glmark2", "--run-forever"]),
|
||||||
|
("vkcube", lambda _s: ["vkcube"]),
|
||||||
|
]
|
||||||
|
|
||||||
|
# NVML clocks-event bits that mean the clocks are being *held back* (a throttle), decoded from
|
||||||
|
# the active-reasons bitmask so we don't depend on per-field name differences across drivers.
|
||||||
|
_THROTTLE_BITS = {
|
||||||
|
0x008: "HW slowdown",
|
||||||
|
0x020: "SW thermal slowdown",
|
||||||
|
0x040: "HW thermal slowdown",
|
||||||
|
0x080: "HW power-brake slowdown",
|
||||||
|
}
|
||||||
|
_POWERCAP_BIT = 0x004 # hitting the power limit — expected under load, reported separately
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MetricStat:
    """Min/avg/max of one sampled metric over the run."""

    # Reading key, e.g. "gpu.temp", "gpu.power", "gpu.clock.core".
    key: str
    # Human-readable label used in the report.
    label: str
    unit: str
    min: float
    avg: float
    max: float
    # Number of values the statistics were computed from.
    samples: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _Tick:
|
||||||
|
dt: float # seconds this tick represents (for dwell-time weighting)
|
||||||
|
values: dict[str, float] # reading key -> value across all sources (Nones dropped)
|
||||||
|
throttle: list[str] # active throttle reasons this tick
|
||||||
|
power_capped: bool
|
||||||
|
lost: bool # query timeout / no GPU response this tick
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class StressResult:
    """Outcome of a stress run: what was run, the telemetry summary and the verdict."""

    # How the load was produced: "command: …" | "auto: gpu-burn" | "monitor-only".
    load: str
    # Seconds actually monitored.
    duration: float
    samples: int
    interval: float
    # Per-metric statistics, plus the headline temperature figures.
    stats: list[MetricStat] = field(default_factory=list)
    peak_temp: float | None = None
    peak_mem_temp: float | None = None
    avg_temp: float | None = None
    # Threshold °C -> seconds spent at or above it.
    time_above: dict[int, float] = field(default_factory=dict)
    # Power draw versus the limit.
    max_power: float | None = None
    power_limit: float | None = None
    power_capped: bool = False
    # Throttling and failure signals.
    throttled: bool = False
    throttle_reasons: list[str] = field(default_factory=list)
    gpu_lost: bool = False
    # Xid/VA-space fault titles seen in the run window.
    faults: list[str] = field(default_factory=list)
    # Ctrl-C or the load exited early.
    aborted: bool = False
    # Conclusion.
    severity: str = health.OK
    verdict: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# --- load resolution ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def available_loaders() -> list[str]:
    """Names of the known GPU load tools present on PATH, in ``_LOADERS`` order."""
    found: list[str] = []
    for tool, _builder in _LOADERS:
        if shutil.which(tool):
            found.append(tool)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def _start_load(command: list[str] | None, duration: float) -> tuple[subprocess.Popen | None, str]:
|
||||||
|
"""Start the load process and return (proc, description). proc is None for monitor-only."""
|
||||||
|
if command:
|
||||||
|
try:
|
||||||
|
proc = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||||
|
return proc, "command: " + " ".join(command)
|
||||||
|
except (OSError, ValueError) as exc:
|
||||||
|
return None, f"monitor-only (command failed to start: {exc})"
|
||||||
|
for name, build in _LOADERS:
|
||||||
|
if shutil.which(name):
|
||||||
|
try:
|
||||||
|
proc = subprocess.Popen(build(duration), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||||
|
return proc, f"auto: {name}"
|
||||||
|
except (OSError, ValueError):
|
||||||
|
continue
|
||||||
|
return None, "monitor-only"
|
||||||
|
|
||||||
|
|
||||||
|
def _stop_load(proc: subprocess.Popen | None) -> None:
|
||||||
|
if proc is None or proc.poll() is not None:
|
||||||
|
return
|
||||||
|
proc.terminate()
|
||||||
|
try:
|
||||||
|
proc.wait(timeout=5)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
proc.kill()
|
||||||
|
|
||||||
|
|
||||||
|
# --- throttle / fault probes ----------------------------------------------------------
|
||||||
|
|
||||||
|
def _throttle_state() -> tuple[list[str], bool]:
|
||||||
|
"""(active throttle reasons, power-capped) decoded from the clocks-event bitmask."""
|
||||||
|
if shutil.which("nvidia-smi") is None:
|
||||||
|
return [], False
|
||||||
|
raw = ""
|
||||||
|
for field_name in ("clocks_event_reasons.active", "clocks_throttle_reasons.active"):
|
||||||
|
try:
|
||||||
|
proc = subprocess.run(
|
||||||
|
["nvidia-smi", f"--query-gpu={field_name}", "--format=csv,noheader"],
|
||||||
|
capture_output=True, text=True, timeout=5,
|
||||||
|
)
|
||||||
|
except (subprocess.SubprocessError, OSError):
|
||||||
|
continue
|
||||||
|
raw = proc.stdout.strip().splitlines()[0].strip() if proc.stdout.strip() else ""
|
||||||
|
if raw and raw.lower() not in ("n/a", "not supported", "[n/a]"):
|
||||||
|
break
|
||||||
|
try:
|
||||||
|
bits = int(raw, 16)
|
||||||
|
except ValueError:
|
||||||
|
return [], False
|
||||||
|
reasons = [name for bit, name in _THROTTLE_BITS.items() if bits & bit]
|
||||||
|
return reasons, bool(bits & _POWERCAP_BIT)
|
||||||
|
|
||||||
|
|
||||||
|
def _faults_since(start_ts: float) -> list[str]:
    """Titles of GPU/PCIe/Hardware/Kernel findings in the kernel journal since ``start_ts``.

    ``start_ts`` is truncated to whole seconds for journalctl's ``--since @…`` argument.
    Returns [] when the journal query produces no output.
    """
    journal = health._journalctl(
        ["-k", "--no-pager", "-o", "cat", "--since", f"@{int(start_ts)}"])
    if not journal:
        return []
    wanted = ("GPU", "PCIe", "Hardware", "Kernel")
    titles = []
    for finding in health.scan_journal_text(journal):
        if finding.category in wanted:
            titles.append(finding.title)
    return titles
|
||||||
|
|
||||||
|
|
||||||
|
def _tick_values(sample: Sample) -> tuple[dict[str, float], bool]:
|
||||||
|
"""Reading key -> value across all sources (Nones dropped), plus whether the GPU
|
||||||
|
failed to respond (an nvidia-smi query timeout — a hang/lost signal)."""
|
||||||
|
values: dict[str, float] = {}
|
||||||
|
lost = False
|
||||||
|
for r in sample.readings:
|
||||||
|
if r.source == "gpu" and r.metric == "status" and r.label == "query-timeout":
|
||||||
|
lost = True
|
||||||
|
if r.value is not None:
|
||||||
|
values[r.key] = r.value
|
||||||
|
return values, lost
|
||||||
|
|
||||||
|
|
||||||
|
# --- pure analysis (unit-testable, no IO) ---------------------------------------------
|
||||||
|
|
||||||
|
_REPORT_KEYS = {
|
||||||
|
"gpu.temp": ("GPU core temp", "°C"),
|
||||||
|
"gpu.temp.memory": ("GPU memory temp", "°C"),
|
||||||
|
"gpu.power": ("GPU power", "W"),
|
||||||
|
"gpu.util": ("GPU utilization", "%"),
|
||||||
|
"gpu.mem_util": ("VRAM controller util", "%"),
|
||||||
|
"gpu.clock.core": ("Core clock", "MHz"),
|
||||||
|
"gpu.clock.memory": ("Memory clock", "MHz"),
|
||||||
|
"gpu.fan": ("Fan", "%"),
|
||||||
|
"gpu.mem_used": ("VRAM used", "MiB"),
|
||||||
|
"cpu.temp": ("CPU temp", "°C"),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def summarize(ticks: list[_Tick], *, load: str, interval: float, faults: list[str],
              thresholds=DEFAULT_THRESHOLDS) -> StressResult:
    """Fold the collected ticks into a StressResult (no IO).

    Computes per-metric min/avg/max for the keys in ``_REPORT_KEYS``, the seconds the GPU
    core temperature spent at or above each threshold (weighted by each tick's ``dt``),
    the union of throttle reasons seen, and whether any tick was power-capped or lost.
    The severity and verdict are then filled in by ``_verdict``.
    """
    total_seconds = sum(tick.dt for tick in ticks)
    result = StressResult(load=load, duration=round(total_seconds, 1), samples=len(ticks),
                          interval=interval, faults=faults)

    by_key: dict[str, list[float]] = {}
    reasons_seen: set[str] = set()
    dwell = {limit: 0.0 for limit in thresholds}
    for tick in ticks:
        for key, value in tick.values.items():
            by_key.setdefault(key, []).append(value)
        reasons_seen.update(tick.throttle)
        if tick.power_capped:
            result.power_capped = True
        if tick.lost:
            result.gpu_lost = True
        core_temp = tick.values.get("gpu.temp")
        if core_temp is None:
            continue
        for limit in thresholds:
            if core_temp >= limit:
                dwell[limit] += tick.dt

    for key, (label, unit) in _REPORT_KEYS.items():
        values = by_key.get(key)
        if not values:
            continue
        stat = MetricStat(
            key=key,
            label=label,
            unit=unit,
            min=round(min(values), 1),
            avg=round(sum(values) / len(values), 1),
            max=round(max(values), 1),
            samples=len(values),
        )
        result.stats.append(stat)
        if key == "gpu.temp":
            result.peak_temp = stat.max
            result.avg_temp = stat.avg
        elif key == "gpu.temp.memory":
            result.peak_mem_temp = stat.max
        elif key == "gpu.power":
            result.max_power = stat.max

    # power_limit isn't a reported metric (it's ~constant); pull it from the raw series.
    if "gpu.power_limit" in by_key:
        result.power_limit = max(by_key["gpu.power_limit"])

    result.throttle_reasons = sorted(reasons_seen)
    result.throttled = bool(reasons_seen)
    # Thresholds that were never reached are left out.
    result.time_above = {limit: round(secs, 1) for limit, secs in dwell.items() if secs > 0}

    _verdict(result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _verdict(r: StressResult) -> None:
    """Fill in ``r.severity`` and ``r.verdict`` from the signals already gathered on ``r``.

    Checked in priority order: a GPU fault or lost GPU (CRITICAL), throttling (WARNING),
    a core peak of 90 °C or more (WARNING), no temperature data at all (INFO), otherwise
    a "stable" verdict with the severity left unchanged.
    """
    peak = "?" if r.peak_temp is None else f"{r.peak_temp:.0f}°C"

    if r.gpu_lost or any(r.faults):
        r.severity = health.CRITICAL
        if r.faults:
            cause = "; ".join(r.faults)
        else:
            cause = "the GPU stopped responding (query timeout)"
        r.verdict = (f"GPU fault during the stress run: {cause}. This reproduces the crash under "
                     f"load — capture/keep these logs. Peak core temp {peak}.")
    elif r.throttled:
        r.severity = health.WARNING
        reasons = ", ".join(r.throttle_reasons)
        r.verdict = (f"Thermal/HW throttling detected ({reasons}) — the GPU "
                     f"held clocks back to stay safe. Peak core temp {peak}. Improve cooling/airflow.")
    elif r.peak_temp is not None and r.peak_temp >= 90:
        r.severity = health.WARNING
        r.verdict = (f"No fault, but the core peaked at {peak} — hot. Watch GDDR6/VRM cooling; "
                     "sustained high temps shorten the card's life and precede instability.")
    elif r.peak_temp is None:
        r.severity = health.INFO
        r.verdict = "No GPU telemetry was captured (nvidia-smi unavailable?)."
    else:
        capped = " (power-limited — hitting the cap, which is normal)" if r.power_capped else ""
        r.verdict = f"Stable: peaked at {peak} with no faults or throttling{capped}."
|
||||||
|
|
||||||
|
|
||||||
|
# --- the run loop (IO) ----------------------------------------------------------------
|
||||||
|
|
||||||
|
def run(duration: float = 120.0, interval: float = 0.5, command: list[str] | None = None,
        thresholds=DEFAULT_THRESHOLDS, on_tick: Callable[[Sample, float], None] | None = None,
        should_stop: Callable[[], bool] | None = None) -> StressResult:
    """Run a GPU load for up to ``duration`` seconds and summarize what was sampled.

    One sample is taken per ``interval`` and recorded as a tick. The loop ends early
    when a tick reports the GPU as lost, when ``should_stop()`` returns True (the GUI's
    Stop button), when the load process has exited, or on Ctrl-C. ``on_tick(sample,
    elapsed)`` is invoked for every tick, before any of the stop checks, so a live
    display also sees the final sample. The load is always stopped on the way out.
    """
    sampler = Sampler(available_sources())
    proc, load_desc = _start_load(command, duration)
    started = time.monotonic()
    started_wall = time.time()       # wall-clock start, used to scope the fault lookup
    history: list[_Tick] = []
    previous = started
    interrupted = False
    try:
        while True:
            sample = sampler.sample()
            now = time.monotonic()
            step, previous = now - previous, now
            elapsed = now - started
            values, lost = _tick_values(sample)
            reasons, capped = _throttle_state()
            history.append(
                _Tick(dt=step, values=values, throttle=reasons, power_capped=capped, lost=lost))
            if on_tick is not None:
                on_tick(sample, elapsed)
            if lost:  # GPU stopped responding — stop now, it may be hung/lost
                break
            if should_stop is not None and should_stop():  # GUI Stop button
                interrupted = True
                break
            load_exited = proc is not None and proc.poll() is not None
            if load_exited or elapsed >= duration:
                break
            # Sleep only for what is left of this interval after the sampling work.
            time.sleep(max(0.0, interval - (time.monotonic() - now)))
    except KeyboardInterrupt:
        interrupted = True
    finally:
        _stop_load(proc)

    result = summarize(history, load=load_desc, interval=interval,
                       faults=_faults_since(started_wall), thresholds=thresholds)
    # A custom command that ended well before the requested duration also counts as aborted.
    result.aborted = interrupted or (
        proc is not None and command is not None and result.duration < duration - interval)
    return result
|
||||||
@@ -8,11 +8,14 @@ state for the UI; `apply_update` performs the no-root self-update.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import functools
|
||||||
import json
|
import json
|
||||||
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from .. import __version__
|
from .. import __version__
|
||||||
from ..config import load_token
|
from ..config import load_token
|
||||||
@@ -31,6 +34,50 @@ UP_TO_DATE = "up-to-date"
|
|||||||
AVAILABLE = "available"
|
AVAILABLE = "available"
|
||||||
|
|
||||||
|
|
||||||
|
APT_PACKAGE = "rigdoctor"
|
||||||
|
|
||||||
|
|
||||||
|
def _dpkg_owns(path: Path) -> bool:
|
||||||
|
"""True if dpkg reports `path` belongs to a package (i.e. an apt/.deb install)."""
|
||||||
|
if not shutil.which("dpkg"):
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
r = subprocess.run(["dpkg", "-S", str(path)], capture_output=True, text=True, timeout=5)
|
||||||
|
except (subprocess.SubprocessError, OSError):
|
||||||
|
return False
|
||||||
|
return r.returncode == 0 and APT_PACKAGE in r.stdout
|
||||||
|
|
||||||
|
|
||||||
|
@functools.lru_cache(maxsize=1)
def install_kind() -> str:
    """Classify the running install as 'apt' (.deb), 'pip' (venv/.run) or 'dev' (source checkout).

    The answer picks the updater: only 'pip' installs are upgraded in place, while
    apt and source installs get guidance instead. The result is cached after the
    first call.
    """
    package_dir = Path(__file__).resolve().parents[1]  # .../rigdoctor
    if _dpkg_owns(package_dir / "__init__.py"):
        kind = "apt"
    elif sys.prefix != sys.base_prefix:
        # Inside a venv → the pip/.run install. This is tested before the checkout
        # probe below, so a checkout running from a venv reports 'pip'.
        kind = "pip"
    elif (package_dir.parents[1] / "pyproject.toml").exists():
        kind = "dev"  # repo checkout
    else:
        location = str(package_dir)
        # System-managed location without a dpkg record — still don't pip over it.
        system_managed = location.startswith("/usr/") or "/dist-packages/" in location
        kind = "apt" if system_managed else "pip"
    return kind
|
||||||
|
|
||||||
|
|
||||||
|
def update_hint(kind: str | None = None) -> str:
    """Return the manual update instructions for an install that pip can't upgrade.

    ``kind`` defaults to the detected install kind. 'apt' and 'dev' installs get a
    command to run; every other kind yields an empty string.
    """
    resolved = kind or install_kind()
    if resolved == "dev":
        return "Running from a source checkout — update with `git pull`."
    if resolved != "apt":
        return ""
    return ("Installed via apt — update with:\n"
            f" sudo apt update && sudo apt install --only-upgrade {APT_PACKAGE}")
|
||||||
|
|
||||||
|
|
||||||
def _parse(version: str) -> tuple[int, ...]:
|
def _parse(version: str) -> tuple[int, ...]:
|
||||||
return tuple(int(p) for p in version.lstrip("vV").split(".") if p.isdigit())
|
return tuple(int(p) for p in version.lstrip("vV").split(".") if p.isdigit())
|
||||||
|
|
||||||
@@ -100,11 +147,16 @@ def list_releases(limit: int = 15, timeout: float = 6.0) -> tuple[list[tuple[str
|
|||||||
|
|
||||||
|
|
||||||
def apply_update(tag: str) -> tuple[int, str]:
|
def apply_update(tag: str) -> tuple[int, str]:
|
||||||
"""Self-update the current (user-local) install to `tag` via authenticated pip.
|
"""Update to `tag` using the method matching how RigDoctor was installed.
|
||||||
|
|
||||||
Installs `rigdoctor[gui] @ git+https://oauth2:<token>@…/rigdoctor.git@<tag>` into
|
Only pip/venv installs are upgraded in place (authenticated pip install of
|
||||||
the running environment. Returns (exit_code, output) with the token scrubbed.
|
`rigdoctor[gui] @ git+https://oauth2:<token>@…/rigdoctor.git@<tag>`). apt and source
|
||||||
|
installs can't be (root/dpkg- or VCS-managed), so they return guidance instead of
|
||||||
|
attempting pip. Returns (exit_code, output) with the token scrubbed.
|
||||||
"""
|
"""
|
||||||
|
kind = install_kind()
|
||||||
|
if kind != "pip":
|
||||||
|
return (1, update_hint(kind))
|
||||||
token = load_token()
|
token = load_token()
|
||||||
if not token:
|
if not token:
|
||||||
return (1, "No update token configured. Run `rigdoctor login`.")
|
return (1, "No update token configured. Run `rigdoctor login`.")
|
||||||
|
|||||||
@@ -40,16 +40,20 @@ def launch_option() -> str:
|
|||||||
return f"{quoted} wrap %command%"
|
return f"{quoted} wrap %command%"
|
||||||
|
|
||||||
|
|
||||||
def run(command: list[str]) -> int:
|
def run(command: list[str], game: str | None = None) -> int:
|
||||||
"""Start a focused capture (unless one's already running), run the game, then stop it.
|
"""Start a focused capture (unless one's already running), run the game, then stop it.
|
||||||
Returns the game's exit code so Steam sees the right status."""
|
Returns the game's exit code so Steam sees the right status.
|
||||||
|
|
||||||
|
`game` overrides name detection — used by `games play` for a custom game (e.g. SPT), where
|
||||||
|
there's no SteamAppId and the bare script name (tarkov.sh) wouldn't tag the capture usefully.
|
||||||
|
"""
|
||||||
from . import diagnostic, reccontrol
|
from . import diagnostic, reccontrol
|
||||||
|
|
||||||
if not command:
|
if not command:
|
||||||
print("usage: rigdoctor wrap %command% (set as a Steam launch option)", file=sys.stderr)
|
print("usage: rigdoctor wrap %command% (set as a Steam launch option)", file=sys.stderr)
|
||||||
return 2
|
return 2
|
||||||
|
|
||||||
game = game_name_from_env() or os.path.basename(command[0])
|
game = game or game_name_from_env() or os.path.basename(command[0])
|
||||||
started = False
|
started = False
|
||||||
if not reccontrol.running_pid(): # don't disturb an existing capture
|
if not reccontrol.running_pid(): # don't disturb an existing capture
|
||||||
started = diagnostic.start(game=game) is not None
|
started = diagnostic.start(game=game) is not None
|
||||||
|
|||||||
@@ -143,7 +143,7 @@ class DiagnosticDialog(QDialog):
|
|||||||
lines.append("\nCapture summary:\n" + render_summary(summary))
|
lines.append("\nCapture summary:\n" + render_summary(summary))
|
||||||
|
|
||||||
since = (summary.start - 60) if summary.start else None
|
since = (summary.start - 60) if summary.start else None
|
||||||
logs = gamelogs.collect(since=since) # scoped to this session
|
logs = gamelogs.collect(since=since, game=result.game) # scoped to this session
|
||||||
if logs:
|
if logs:
|
||||||
lines.append("\nGame/Proton/Steam logs for this session:\n" + logs)
|
lines.append("\nGame/Proton/Steam logs for this session:\n" + logs)
|
||||||
sys_logs = syslogs.collect(since=since) # kernel log + crashed-process records
|
sys_logs = syslogs.collect(since=since) # kernel log + crashed-process records
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from PySide6.QtWidgets import (
|
|||||||
QApplication,
|
QApplication,
|
||||||
QCheckBox,
|
QCheckBox,
|
||||||
QDialog,
|
QDialog,
|
||||||
|
QFileDialog,
|
||||||
QFrame,
|
QFrame,
|
||||||
QHBoxLayout,
|
QHBoxLayout,
|
||||||
QLabel,
|
QLabel,
|
||||||
@@ -29,6 +30,7 @@ from PySide6.QtWidgets import (
|
|||||||
|
|
||||||
from ..config import load_config, update_config
|
from ..config import load_config, update_config
|
||||||
from .diagnostic_dialog import DiagnosticDialog
|
from .diagnostic_dialog import DiagnosticDialog
|
||||||
|
from .minidump_dialog import MinidumpDialog
|
||||||
from .theme import ACCENT, GOOD, MUTED, WARN
|
from .theme import ACCENT, GOOD, MUTED, WARN
|
||||||
|
|
||||||
|
|
||||||
@@ -79,6 +81,7 @@ class GamesPage(QWidget):
|
|||||||
_scanned = Signal(object) # steam.ScanResult
|
_scanned = Signal(object) # steam.ScanResult
|
||||||
new_count_changed = Signal(int) # newly-installed game count (for the nav badge)
|
new_count_changed = Signal(int) # newly-installed game count (for the nav badge)
|
||||||
_diag_done = Signal(object) # DiagnosticResult — focused capture analyzed
|
_diag_done = Signal(object) # DiagnosticResult — focused capture analyzed
|
||||||
|
_dump_parsed = Signal(object) # minidump.MinidumpReport — imported .dmp (or None)
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -86,6 +89,7 @@ class GamesPage(QWidget):
|
|||||||
self._libraries_ready.connect(self._render_libraries)
|
self._libraries_ready.connect(self._render_libraries)
|
||||||
self._scanned.connect(self._render_games)
|
self._scanned.connect(self._render_games)
|
||||||
self._diag_done.connect(self._on_diag_done)
|
self._diag_done.connect(self._on_diag_done)
|
||||||
|
self._dump_parsed.connect(self._on_dump_parsed)
|
||||||
self._busy = False
|
self._busy = False
|
||||||
self._new_appids: set[str] = set()
|
self._new_appids: set[str] = set()
|
||||||
self._extra_games: list = [] # non-Steam (Lutris/Heroic), appended after a scan
|
self._extra_games: list = [] # non-Steam (Lutris/Heroic), appended after a scan
|
||||||
@@ -103,9 +107,18 @@ class GamesPage(QWidget):
|
|||||||
self._status = QLabel("")
|
self._status = QLabel("")
|
||||||
self._status.setObjectName("Muted")
|
self._status.setObjectName("Muted")
|
||||||
header.addWidget(self._status)
|
header.addWidget(self._status)
|
||||||
|
# Import a Windows crash dump (.dmp) from a Proton game and analyze it with AI.
|
||||||
|
# Shown only when an AI provider is configured (AI analysis is the point).
|
||||||
|
self._import_btn = QPushButton("Import crash dump…")
|
||||||
|
self._import_btn.clicked.connect(self._import_dump)
|
||||||
|
header.addWidget(self._import_btn)
|
||||||
self._autocap_btn = QPushButton("Auto-capture…")
|
self._autocap_btn = QPushButton("Auto-capture…")
|
||||||
self._autocap_btn.clicked.connect(self._show_autocapture)
|
self._autocap_btn.clicked.connect(self._show_autocapture)
|
||||||
header.addWidget(self._autocap_btn)
|
header.addWidget(self._autocap_btn)
|
||||||
|
# Add a game no launcher reports (e.g. SPT / standalone mod launchers).
|
||||||
|
self._add_btn = QPushButton("Add game…")
|
||||||
|
self._add_btn.clicked.connect(self._add_custom_game)
|
||||||
|
header.addWidget(self._add_btn)
|
||||||
self._rescan_btn = QPushButton("Rescan")
|
self._rescan_btn = QPushButton("Rescan")
|
||||||
self._rescan_btn.setObjectName("PrimaryButton")
|
self._rescan_btn.setObjectName("PrimaryButton")
|
||||||
self._rescan_btn.clicked.connect(self.refresh)
|
self._rescan_btn.clicked.connect(self.refresh)
|
||||||
@@ -192,6 +205,7 @@ class GamesPage(QWidget):
|
|||||||
self._load_cached() # instant display from the last scan
|
self._load_cached() # instant display from the last scan
|
||||||
QTimer.singleShot(400, self.refresh) # then rescan in the background on launch
|
QTimer.singleShot(400, self.refresh) # then rescan in the background on launch
|
||||||
self._check_crash() # surface an interrupted (crashed) diagnostic
|
self._check_crash() # surface an interrupted (crashed) diagnostic
|
||||||
|
self._refresh_import_btn() # show Import only if AI is configured
|
||||||
|
|
||||||
# --- loading ----------------------------------------------------------------------
|
# --- loading ----------------------------------------------------------------------
|
||||||
|
|
||||||
@@ -225,7 +239,9 @@ class GamesPage(QWidget):
|
|||||||
]
|
]
|
||||||
self._libraries_ready.emit(libs)
|
self._libraries_ready.emit(libs)
|
||||||
try:
|
try:
|
||||||
self._extra_games = launchers.scan() # Lutris / Heroic (non-Steam)
|
from ..core import customgames
|
||||||
|
# non-Steam: Lutris/Heroic + user-added games (SPT etc.)
|
||||||
|
self._extra_games = list(launchers.scan()) + customgames.scan()
|
||||||
except Exception:
|
except Exception:
|
||||||
self._extra_games = []
|
self._extra_games = []
|
||||||
self._scanned.emit(steam.rescan())
|
self._scanned.emit(steam.rescan())
|
||||||
@@ -413,6 +429,83 @@ class GamesPage(QWidget):
|
|||||||
reccontrol.stop_background()
|
reccontrol.stop_background()
|
||||||
self._banner.hide()
|
self._banner.hide()
|
||||||
|
|
||||||
|
def _add_custom_game(self) -> None:
    """Ask for a game no launcher reports (e.g. SPT) and register it as a custom game.

    The dialog collects a name plus an optional launch command/script (so the game
    can be launched under crash-capture) and an optional log folder. Cancelling, or
    accepting with a blank name, does nothing. A falsy result from
    ``customgames.add`` is reported to the user as a duplicate.
    """
    from ..core import customgames

    dialog = QDialog(self)
    dialog.setWindowTitle("Add game")
    dialog.setMinimumWidth(560)
    form = QVBoxLayout(dialog)
    form.setContentsMargins(20, 18, 20, 16)
    form.setSpacing(10)

    intro = QLabel(
        "Add a game no launcher reports — a standalone mod launcher like SPT, an itch.io "
        "download, or any hand-installed game.")
    intro.setWordWrap(True)
    form.addWidget(intro)

    name_edit = QLineEdit()
    name_edit.setPlaceholderText("SPT")
    form.addWidget(QLabel("Game name"))
    form.addWidget(name_edit)

    def _browse_row(caption: str, placeholder: str):
        """Add a captioned line edit with a Browse… button; return (edit, button)."""
        edit = QLineEdit()
        edit.setPlaceholderText(placeholder)
        row = QHBoxLayout()
        row.addWidget(edit, 1)
        button = QPushButton("Browse…")
        row.addWidget(button, 0)
        form.addWidget(QLabel(caption))
        form.addLayout(row)
        return edit, button

    cmd_edit, cmd_browse = _browse_row(
        "Launch command / script (optional — enables launch + auto-capture)",
        "e.g. /run/media/.../Escape-From-Tarkov/tarkov.sh")
    log_edit, log_browse = _browse_row(
        "Log folder (optional — read into crash diagnostics)",
        "auto-detected from the script's folder (its logs/ subfolder)")

    def _choose_script() -> None:
        chosen, _selected_filter = QFileDialog.getOpenFileName(
            dialog, "Select the launch script/executable")
        if chosen:
            cmd_edit.setText(chosen)

    def _choose_logdir() -> None:
        chosen = QFileDialog.getExistingDirectory(dialog, "Select the game's log folder")
        if chosen:
            log_edit.setText(chosen)

    cmd_browse.clicked.connect(_choose_script)
    log_browse.clicked.connect(_choose_logdir)

    button_row = QHBoxLayout()
    button_row.addStretch(1)
    cancel_btn = QPushButton("Cancel")
    cancel_btn.clicked.connect(dialog.reject)
    button_row.addWidget(cancel_btn)
    add_btn = QPushButton("Add")
    add_btn.setObjectName("PrimaryButton")
    add_btn.setDefault(True)
    add_btn.clicked.connect(dialog.accept)
    button_row.addWidget(add_btn)
    form.addLayout(button_row)

    if dialog.exec() != QDialog.DialogCode.Accepted:
        return
    name = name_edit.text().strip()
    if not name:
        return
    # Blank optional fields are passed as None rather than as empty strings.
    added = customgames.add(name, command=cmd_edit.text().strip() or None,
                            logdir=log_edit.text().strip() or None)
    if not added:
        QMessageBox.information(self, "Add game", f"'{name}' is already in your games.")
        return
    self.refresh()
|
||||||
|
|
||||||
def _show_autocapture(self) -> None:
|
def _show_autocapture(self) -> None:
|
||||||
from ..core import wrap
|
from ..core import wrap
|
||||||
|
|
||||||
@@ -450,6 +543,49 @@ class GamesPage(QWidget):
|
|||||||
v.addLayout(buttons)
|
v.addLayout(buttons)
|
||||||
dlg.exec()
|
dlg.exec()
|
||||||
|
|
||||||
|
# --- import a crash dump (.dmp) ---------------------------------------------------
|
||||||
|
|
||||||
|
def _refresh_import_btn(self) -> None:
    """Show the "Import crash dump…" button only while an AI provider is configured."""
    from ..core import ai

    configured = ai.is_configured()
    self._import_btn.setVisible(configured)
|
||||||
|
|
||||||
|
def _import_dump(self) -> None:
    """Let the user pick a crash dump (.dmp) and parse it on a background thread.

    Without a configured AI provider the user is pointed at Settings instead.
    While the dump is parsed the button is disabled and the status label shows
    progress.
    """
    from ..core import ai

    if ai.is_configured():
        chosen, _selected_filter = QFileDialog.getOpenFileName(
            self, "Import crash dump", os.path.expanduser("~"),
            "Crash dumps (*.dmp);;All files (*)")
        if chosen:
            self._import_btn.setEnabled(False)
            self._status.setText("Parsing crash dump…")
            # Daemon thread: parsing must not block the GUI or keep the app alive.
            worker = threading.Thread(target=self._work_import, args=(chosen,), daemon=True)
            worker.start()
    else:
        QMessageBox.information(
            self, "RigDoctor",
            "Set up an AI provider first (Settings → AI assistant) to analyze a crash dump.")
|
||||||
|
|
||||||
|
def _work_import(self, path: str) -> None:
    """Worker-thread body: parse the dump at ``path`` and emit it via ``_dump_parsed``.

    Any exception raised while parsing is reported as ``None`` rather than propagated.
    """
    from ..core import minidump

    try:
        parsed = minidump.parse(path)  # parses + runs minidump_stackwalk if installed
    except Exception:
        self._dump_parsed.emit(None)
    else:
        self._dump_parsed.emit(parsed)
|
||||||
|
|
||||||
|
def _on_dump_parsed(self, report) -> None:
    """Handle a finished import: restore the controls, then show the report or a warning.

    ``report`` is ``None`` when parsing raised; otherwise its ``ok`` flag decides
    between the results dialog and a warning that carries ``report.error``.
    """
    self._import_btn.setEnabled(True)
    self._status.setText("")
    if report is not None and report.ok:
        MinidumpDialog(report, self).exec()
        return
    detail = "Couldn't read the file." if report is None else report.error
    QMessageBox.warning(
        self, "Import crash dump", f"Couldn't analyze the dump — {detail}")
|
||||||
|
|
||||||
# --- hard-crash recovery ----------------------------------------------------------
|
# --- hard-crash recovery ----------------------------------------------------------
|
||||||
|
|
||||||
def _check_crash(self) -> None:
|
def _check_crash(self) -> None:
|
||||||
@@ -498,6 +634,7 @@ class GamesPage(QWidget):
|
|||||||
# Viewing the list acknowledges the new games: clear the sidebar badge. The NEW
|
# Viewing the list acknowledges the new games: clear the sidebar badge. The NEW
|
||||||
# tags stay on the rows for this session so the user can still spot them.
|
# tags stay on the rows for this session so the user can still spot them.
|
||||||
super().showEvent(event)
|
super().showEvent(event)
|
||||||
|
self._refresh_import_btn() # AI may have been configured since this page was built
|
||||||
if self._new_appids:
|
if self._new_appids:
|
||||||
from ..core import steam
|
from ..core import steam
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,9 @@ class HealthPage(QWidget):
|
|||||||
self._status = QLabel("")
|
self._status = QLabel("")
|
||||||
self._status.setObjectName("Muted")
|
self._status.setObjectName("Muted")
|
||||||
header.addWidget(self._status)
|
header.addWidget(self._status)
|
||||||
|
self._stress_btn = QPushButton("Stress test…")
|
||||||
|
self._stress_btn.clicked.connect(self._open_stress)
|
||||||
|
header.addWidget(self._stress_btn)
|
||||||
self._run_btn = QPushButton("Run health report")
|
self._run_btn = QPushButton("Run health report")
|
||||||
self._run_btn.setObjectName("PrimaryButton")
|
self._run_btn.setObjectName("PrimaryButton")
|
||||||
self._run_btn.clicked.connect(self._run)
|
self._run_btn.clicked.connect(self._run)
|
||||||
@@ -59,6 +62,11 @@ class HealthPage(QWidget):
|
|||||||
|
|
||||||
QTimer.singleShot(300, self._run) # auto-run shortly after the window opens
|
QTimer.singleShot(300, self._run) # auto-run shortly after the window opens
|
||||||
|
|
||||||
|
def _open_stress(self) -> None:
    """Open the stress-test dialog modally, parented to this page."""
    from .stress_dialog import StressDialog

    dialog = StressDialog(self)
    dialog.exec()
|
||||||
|
|
||||||
def _run(self) -> None:
|
def _run(self) -> None:
|
||||||
self._run_btn.setEnabled(False)
|
self._run_btn.setEnabled(False)
|
||||||
self._status.setText("Scanning logs, SMART, and driver…")
|
self._status.setText("Scanning logs, SMART, and driver…")
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from PySide6.QtWidgets import (
|
|||||||
QMainWindow,
|
QMainWindow,
|
||||||
QMessageBox,
|
QMessageBox,
|
||||||
QPushButton,
|
QPushButton,
|
||||||
|
QScrollArea,
|
||||||
QStackedWidget,
|
QStackedWidget,
|
||||||
QSystemTrayIcon,
|
QSystemTrayIcon,
|
||||||
QTextEdit,
|
QTextEdit,
|
||||||
@@ -51,6 +52,10 @@ _NAV = [
|
|||||||
("App", ["Settings", "Share"]),
|
("App", ["Settings", "Share"]),
|
||||||
]
|
]
|
||||||
_PAGES = [name for _section, names in _NAV for name in names]
|
_PAGES = [name for _section, names in _NAV for name in names]
|
||||||
|
# Pages that manage their own scrolling (pinned header + inner scroll) or must fill the
|
||||||
|
# viewport (the Share terminal) — these are added to the stack as-is; every other page is
|
||||||
|
# wrapped in a QScrollArea so it scrolls when too tall and doesn't pin the window's height.
|
||||||
|
_NO_WRAP = {"Dashboard", "System Health", "Inventory", "Share"}
|
||||||
_ICON = Path(__file__).parent / "assets" / "rigdoctor.svg"
|
_ICON = Path(__file__).parent / "assets" / "rigdoctor.svg"
|
||||||
|
|
||||||
|
|
||||||
@@ -68,7 +73,11 @@ class MainWindow(QMainWindow):
|
|||||||
|
|
||||||
central = QWidget()
|
central = QWidget()
|
||||||
self.setCentralWidget(central)
|
self.setCentralWidget(central)
|
||||||
layout = QHBoxLayout(central)
|
outer = QVBoxLayout(central)
|
||||||
|
outer.setContentsMargins(0, 0, 0, 0)
|
||||||
|
outer.setSpacing(0)
|
||||||
|
body = QWidget()
|
||||||
|
layout = QHBoxLayout(body)
|
||||||
layout.setContentsMargins(0, 0, 0, 0)
|
layout.setContentsMargins(0, 0, 0, 0)
|
||||||
layout.setSpacing(0)
|
layout.setSpacing(0)
|
||||||
|
|
||||||
@@ -100,11 +109,14 @@ class MainWindow(QMainWindow):
|
|||||||
"Share": self.share_page,
|
"Share": self.share_page,
|
||||||
}
|
}
|
||||||
for name in _PAGES:
|
for name in _PAGES:
|
||||||
self._stack.addWidget(self._pages[name])
|
page = self._pages[name]
|
||||||
|
self._stack.addWidget(page if name in _NO_WRAP else self._scrollable(page))
|
||||||
content_layout.addWidget(self._stack)
|
content_layout.addWidget(self._stack)
|
||||||
|
|
||||||
layout.addWidget(self._build_sidebar())
|
layout.addWidget(self._build_sidebar())
|
||||||
layout.addWidget(content, 1)
|
layout.addWidget(content, 1)
|
||||||
|
outer.addWidget(body, 1)
|
||||||
|
outer.addWidget(self._build_footer())
|
||||||
|
|
||||||
self._worker = SamplerWorker(interval=interval)
|
self._worker = SamplerWorker(interval=interval)
|
||||||
self._worker.sampled.connect(self.dashboard.update_sample)
|
self._worker.sampled.connect(self.dashboard.update_sample)
|
||||||
@@ -216,9 +228,6 @@ class MainWindow(QMainWindow):
|
|||||||
v.addStretch(1)
|
v.addStretch(1)
|
||||||
live = QLabel(f'<span style="color:{ACCENT};">●</span> <span style="color:{MUTED};">Live</span>')
|
live = QLabel(f'<span style="color:{ACCENT};">●</span> <span style="color:{MUTED};">Live</span>')
|
||||||
v.addWidget(live)
|
v.addWidget(live)
|
||||||
version = QLabel(f"v{__version__}")
|
|
||||||
version.setObjectName("Muted")
|
|
||||||
v.addWidget(version)
|
|
||||||
changelog_btn = QPushButton("Changelog")
|
changelog_btn = QPushButton("Changelog")
|
||||||
changelog_btn.setObjectName("LinkButton")
|
changelog_btn.setObjectName("LinkButton")
|
||||||
changelog_btn.setCursor(Qt.CursorShape.PointingHandCursor)
|
changelog_btn.setCursor(Qt.CursorShape.PointingHandCursor)
|
||||||
@@ -248,6 +257,27 @@ class MainWindow(QMainWindow):
|
|||||||
v.addWidget(self._restart_btn)
|
v.addWidget(self._restart_btn)
|
||||||
return bar
|
return bar
|
||||||
|
|
||||||
|
def _scrollable(self, page: QWidget) -> QScrollArea:
    """Return ``page`` wrapped in a frameless, resizable scroll area.

    The wrapper lets a tall page scroll, and lets the window shrink below the
    page's natural height instead of being pinned to it. The horizontal scroll
    bar is always off.
    """
    scroller = QScrollArea()
    scroller.setWidget(page)
    scroller.setWidgetResizable(True)
    scroller.setHorizontalScrollBarPolicy(Qt.ScrollBarPolicy.ScrollBarAlwaysOff)
    scroller.setFrameShape(QFrame.Shape.NoFrame)
    return scroller
|
||||||
|
|
||||||
|
def _build_footer(self) -> QFrame:
    """Build the footer bar: a right-aligned, muted "RigDoctor v<version>" label."""
    footer = QFrame()
    footer.setObjectName("Footer")
    row = QHBoxLayout(footer)
    row.setContentsMargins(14, 5, 16, 5)
    row.addStretch(1)  # pushes the label to the right edge
    version_label = QLabel(f"RigDoctor v{__version__}")
    version_label.setObjectName("Muted")
    row.addWidget(version_label)
    return footer
|
||||||
|
|
||||||
def _restart(self) -> None:
|
def _restart(self) -> None:
|
||||||
gui = os.path.join(os.path.dirname(sys.executable), "rigdoctor-gui")
|
gui = os.path.join(os.path.dirname(sys.executable), "rigdoctor-gui")
|
||||||
if os.path.exists(gui):
|
if os.path.exists(gui):
|
||||||
@@ -259,6 +289,9 @@ class MainWindow(QMainWindow):
|
|||||||
def _apply_update(self) -> None:
|
def _apply_update(self) -> None:
|
||||||
if not self._latest_tag:
|
if not self._latest_tag:
|
||||||
return
|
return
|
||||||
|
if updates.install_kind() != "pip": # apt/source: can't pip-update — show the command
|
||||||
|
QMessageBox.information(self, "Update RigDoctor", updates.update_hint())
|
||||||
|
return
|
||||||
box = QMessageBox(self)
|
box = QMessageBox(self)
|
||||||
box.setWindowTitle(f"Update to {self._latest_tag}")
|
box.setWindowTitle(f"Update to {self._latest_tag}")
|
||||||
box.setText(f"Update RigDoctor to {self._latest_tag}?")
|
box.setText(f"Update RigDoctor to {self._latest_tag}?")
|
||||||
@@ -424,7 +457,7 @@ class MainWindow(QMainWindow):
|
|||||||
self._update_label.setText("update check unavailable")
|
self._update_label.setText("update check unavailable")
|
||||||
elif state == updates.AVAILABLE:
|
elif state == updates.AVAILABLE:
|
||||||
self._update_label.setText(f'<span style="color:{GOOD};">{tag} available</span>')
|
self._update_label.setText(f'<span style="color:{GOOD};">{tag} available</span>')
|
||||||
self._update_btn.setText(f"Update to {tag}")
|
self._update_btn.setText(f"Update to {tag}" if updates.install_kind() == "pip" else "How to update")
|
||||||
self._update_btn.setVisible(True)
|
self._update_btn.setVisible(True)
|
||||||
if self._alert_monitor.enabled and tag != self._notified_update_tag:
|
if self._alert_monitor.enabled and tag != self._notified_update_tag:
|
||||||
self._notified_update_tag = tag # once per version, not every poll
|
self._notified_update_tag = tag # once per version, not every poll
|
||||||
|
|||||||
@@ -0,0 +1,182 @@
|
|||||||
|
"""Results view for an imported crash dump (.dmp, M14): parsed summary + AI explanation.
|
||||||
|
|
||||||
|
Mirrors :class:`DiagnosticDialog` — the same opt-in, streamed "Explain with AI" flow (D24),
|
||||||
|
applied to a Windows minidump parsed by :mod:`core.minidump` instead of a sensor capture.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PySide6.QtCore import Qt, Signal
|
||||||
|
from PySide6.QtGui import QFont, QTextCursor
|
||||||
|
from PySide6.QtWidgets import (
|
||||||
|
QDialog,
|
||||||
|
QFrame,
|
||||||
|
QHBoxLayout,
|
||||||
|
QLabel,
|
||||||
|
QMessageBox,
|
||||||
|
QPushButton,
|
||||||
|
QScrollArea,
|
||||||
|
QTextEdit,
|
||||||
|
QVBoxLayout,
|
||||||
|
QWidget,
|
||||||
|
)
|
||||||
|
|
||||||
|
from ..core import minidump
|
||||||
|
from .widgets import finding_card
|
||||||
|
|
||||||
|
|
||||||
|
class MinidumpDialog(QDialog):
|
||||||
|
_chunk = Signal(str) # streamed token delta (worker thread -> GUI)
|
||||||
|
_explained = Signal(object) # (ok, full_text) when the AI stream finishes
|
||||||
|
|
||||||
|
def __init__(self, report: minidump.MinidumpReport, parent=None) -> None:
    """Build the results dialog for a parsed crash dump.

    Layout, top to bottom: a title, a scrollable body (monospace dump summary,
    one card per finding and, when present, the raw ``minidump_stackwalk`` output),
    then a button row with "Explain with AI" and "Close".

    Args:
        report: the parsed dump; ``report.path`` names the window and
            ``report.stackwalk`` decides whether the stackwalk section is shown.
        parent: optional parent widget.
    """
    super().__init__(parent)
    self._report = report
    # Set while the AI stream dialog is open; None otherwise.
    self._stream_view = None
    self._stream_status = None
    self._chunk.connect(self._on_chunk)
    self._explained.connect(self._on_explained)
    name = Path(report.path).name
    self.setWindowTitle(f"Crash dump — {name}")
    self.resize(660, 680)

    root = QVBoxLayout(self)
    root.setContentsMargins(20, 18, 20, 16)
    root.setSpacing(14)

    title = QLabel(f"Crash dump — {name}")
    title.setObjectName("PageTitle")
    root.addWidget(title)

    scroll = QScrollArea()
    scroll.setWidgetResizable(True)
    scroll.setFrameShape(QFrame.Shape.NoFrame)
    scroll.setStyleSheet("background: transparent;")
    body = QWidget()
    col = QVBoxLayout(body)
    col.setContentsMargins(0, 0, 0, 0)
    col.setSpacing(10)
    col.setAlignment(Qt.AlignmentFlag.AlignTop)

    # Parsed summary (crash reason / faulting module / OS / CPU / modules) — monospace.
    summary_head = QLabel("Dump summary")
    summary_head.setStyleSheet("font-weight: 700; background: transparent;")
    col.addWidget(summary_head)
    summary = QLabel(minidump.to_text(report))
    summary.setObjectName("Report")
    summary.setFont(QFont("monospace"))
    summary.setTextInteractionFlags(Qt.TextInteractionFlag.TextSelectableByMouse)
    summary.setWordWrap(False)
    summary.setStyleSheet(
        "background: #0d0f13; color: #cfd3da; border: 1px solid #2a2f39; "
        "border-radius: 8px; padding: 10px;"
    )
    col.addWidget(summary)

    findings = minidump.to_findings(report)
    find_head = QLabel(f"Findings ({len(findings)})")
    find_head.setStyleSheet("font-weight: 700; background: transparent;")
    col.addWidget(find_head)
    for finding in findings:
        col.addWidget(finding_card(finding))

    if report.stackwalk:  # only when an external stackwalker was available
        sw_head = QLabel("minidump_stackwalk output")
        sw_head.setStyleSheet("font-weight: 700; background: transparent;")
        col.addWidget(sw_head)
        sw = QTextEdit()
        sw.setObjectName("Report")
        sw.setReadOnly(True)
        sw.setFont(QFont("monospace"))
        sw.setPlainText(report.stackwalk)
        sw.setMinimumHeight(160)
        col.addWidget(sw)

    scroll.setWidget(body)
    root.addWidget(scroll, 1)

    buttons = QHBoxLayout()
    self._explain_btn = QPushButton("Explain with AI")
    self._explain_btn.clicked.connect(self._explain_with_ai)
    buttons.addWidget(self._explain_btn)
    buttons.addStretch(1)
    close = QPushButton("Close")
    close.setObjectName("PrimaryButton")
    close.clicked.connect(self.accept)
    buttons.addWidget(close)
    root.addLayout(buttons)

    # Apply visibility only now that the button row is attached to the dialog:
    # setVisible(True) on a still-parentless widget would show it as its own
    # top-level window for a moment.
    from ..core import ai
    self._explain_btn.setVisible(ai.is_configured())  # opt-in only; hidden if not set up
|
||||||
|
|
||||||
|
# --- AI explanation (M14, D24) — streamed; runs only on this button press ----------
|
||||||
|
def _explain_with_ai(self) -> None:
    """Stream an AI explanation of the crash dump into a modal dialog.

    When the configured provider is not local, the user is asked for explicit
    consent first; anything other than "Yes" cancels without sending data.
    """
    from ..core import ai

    if not ai.is_local():  # cloud provider → explicit consent before sending data
        yes = QMessageBox.StandardButton.Yes
        no = QMessageBox.StandardButton.No
        prompt = f"This sends the parsed crash dump to {ai.provider_label()}.\n\nContinue?"
        answer = QMessageBox.question(self, "Send to AI provider", prompt, yes | no, no)
        if answer != yes:
            return

    self._explain_btn.setEnabled(False)
    stream_dialog = self._open_stream_dialog()
    worker = threading.Thread(target=self._work_explain, daemon=True)
    worker.start()
    # The nested event loop below keeps delivering the worker's signals, so the
    # view fills in live while the dialog is open.
    stream_dialog.exec()
    # Dialog closed: drop the widget references so late signals become no-ops.
    self._stream_view = None
    self._stream_status = None
    self._explain_btn.setEnabled(True)
|
||||||
|
|
||||||
|
def _work_explain(self) -> None:
|
||||||
|
from ..core import ai
|
||||||
|
|
||||||
|
text = minidump.to_ai_text(self._report)
|
||||||
|
ok, reply = ai.explain_stream(text, on_chunk=lambda d: self._chunk.emit(d))
|
||||||
|
self._explained.emit((ok, reply))
|
||||||
|
|
||||||
|
def _on_chunk(self, delta: str) -> None:
|
||||||
|
if self._stream_view is None:
|
||||||
|
return
|
||||||
|
self._stream_view.moveCursor(QTextCursor.MoveOperation.End)
|
||||||
|
self._stream_view.insertPlainText(delta) # live plain text as tokens arrive
|
||||||
|
self._stream_view.ensureCursorVisible()
|
||||||
|
|
||||||
|
def _on_explained(self, result) -> None:
|
||||||
|
ok, text = result
|
||||||
|
if self._stream_view is not None:
|
||||||
|
if ok:
|
||||||
|
self._stream_view.setMarkdown(text) # re-render the finished answer as Markdown
|
||||||
|
else:
|
||||||
|
self._stream_view.setPlainText(f"AI explanation failed:\n\n{text}")
|
||||||
|
if self._stream_status is not None:
|
||||||
|
self._stream_status.setText(
|
||||||
|
"AI-generated suggestions — verify before acting, especially anything that changes "
|
||||||
|
"settings or data." if ok else "The request failed.")
|
||||||
|
|
||||||
|
def _open_stream_dialog(self) -> QDialog:
    """Build the dialog the AI answer streams into and remember its widgets.

    The text view and status label are stored on ``self`` so the chunk/finished
    handlers can update them; the dialog itself is returned unshown.
    """
    from ..core import ai

    stream_dlg = QDialog(self)
    stream_dlg.setWindowTitle(f"AI explanation — {ai.provider_label()}")
    stream_dlg.resize(620, 520)
    layout = QVBoxLayout(stream_dlg)

    # Read-only text area that receives the streamed answer.
    text_view = QTextEdit()
    text_view.setObjectName("Report")
    text_view.setReadOnly(True)
    layout.addWidget(text_view)

    # One-line status under the answer.
    status_label = QLabel("Streaming from the model…")
    status_label.setObjectName("Muted")
    status_label.setWordWrap(True)
    layout.addWidget(status_label)

    close_btn = QPushButton("Close")
    close_btn.setObjectName("PrimaryButton")
    close_btn.clicked.connect(stream_dlg.accept)
    layout.addWidget(close_btn, alignment=Qt.AlignmentFlag.AlignRight)

    self._stream_view, self._stream_status = text_view, status_label
    return stream_dlg
|
||||||
@@ -0,0 +1,157 @@
|
|||||||
|
"""GPU stress + thermal-monitor dialog (GUI front-end for core/stress.py).
|
||||||
|
|
||||||
|
Runs the stress monitor in a background thread, streams a live one-line readout, and shows the
|
||||||
|
rendered result (telemetry stats + verdict) when it finishes. A Stop button ends the run early
|
||||||
|
via a cooperative flag; closing the dialog mid-run stops it too.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import threading
|
||||||
|
|
||||||
|
from PySide6.QtCore import Qt, Signal
|
||||||
|
from PySide6.QtGui import QFont
|
||||||
|
from PySide6.QtWidgets import (
|
||||||
|
QDialog,
|
||||||
|
QHBoxLayout,
|
||||||
|
QLabel,
|
||||||
|
QLineEdit,
|
||||||
|
QPushButton,
|
||||||
|
QSpinBox,
|
||||||
|
QTextEdit,
|
||||||
|
QVBoxLayout,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class StressDialog(QDialog):
    """GPU stress + thermal-monitor dialog.

    Runs ``stress.run`` on a daemon worker thread, shows a live one-line readout
    while it runs, and displays the rendered result (or the failure message) when
    it finishes. The run is ended cooperatively through a ``threading.Event`` that
    is set by the Stop button and whenever the dialog is closed or dismissed.
    """

    _tick = Signal(str)  # live one-line readout (worker thread -> GUI)
    _done = Signal(object)  # stress.StressResult, or the Exception the run raised

    def __init__(self, parent=None) -> None:
        """Build the widgets and wire the worker signals to their GUI-side slots."""
        super().__init__(parent)
        self._stop = threading.Event()
        self._running = False
        self._tick.connect(self._on_tick)
        self._done.connect(self._on_done)
        self.setWindowTitle("GPU stress + thermal monitor")
        self.resize(640, 460)

        root = QVBoxLayout(self)
        root.setContentsMargins(20, 18, 20, 16)
        root.setSpacing(12)

        intro = QLabel(
            "Run a GPU load and closely watch temps. Reports peak/sustained temps, time spent "
            "hot, throttling, and any GPU fault (Xid / driver freeze) during the run.")
        intro.setWordWrap(True)
        root.addWidget(intro)

        from ..core import stress
        loaders = stress.available_loaders()
        self._mode = QLabel(
            f"Load tool detected: {loaders[0]} — it'll drive the load." if loaders else
            "No GPU load tool installed → MONITOR-ONLY: start this, then launch your game; "
            "it tracks temps while you play. (Or give a command below.)")
        self._mode.setObjectName("Muted")
        self._mode.setWordWrap(True)
        root.addWidget(self._mode)

        form = QHBoxLayout()
        form.addWidget(QLabel("Duration (s):"))
        self._duration = QSpinBox()
        self._duration.setRange(5, 3600)
        self._duration.setValue(120)
        form.addWidget(self._duration)
        form.addSpacing(12)
        form.addWidget(QLabel("Command (optional):"))
        self._command = QLineEdit()
        self._command.setPlaceholderText("e.g. /…/tarkov.sh or gpu-burn 60")
        form.addWidget(self._command, 1)
        root.addLayout(form)

        self._live = QLabel("—")
        self._live.setFont(QFont("monospace"))
        self._live.setStyleSheet("background: #0d0f13; color: #cfd3da; border: 1px solid #2a2f39; "
                                 "border-radius: 8px; padding: 8px;")
        root.addWidget(self._live)

        # Hidden until a run finishes; then it shows the rendered result.
        self._report = QTextEdit()
        self._report.setReadOnly(True)
        self._report.setFont(QFont("monospace"))
        self._report.setVisible(False)
        root.addWidget(self._report, 1)

        buttons = QHBoxLayout()
        buttons.addStretch(1)
        self._stop_btn = QPushButton("Stop")
        self._stop_btn.setEnabled(False)
        self._stop_btn.clicked.connect(self._on_stop)
        buttons.addWidget(self._stop_btn)
        self._start_btn = QPushButton("Start")
        self._start_btn.setObjectName("PrimaryButton")
        self._start_btn.clicked.connect(self._on_start)
        buttons.addWidget(self._start_btn)
        root.addLayout(buttons)

    def _on_start(self) -> None:
        """Start a run on a worker thread (ignored while one is already running)."""
        if self._running:
            return
        self._running = True
        self._stop.clear()
        self._start_btn.setEnabled(False)
        self._stop_btn.setEnabled(True)
        self._report.setVisible(False)
        self._live.setText("starting…")
        duration = float(self._duration.value())
        command_text = self._command.text().strip()
        threading.Thread(target=self._work, args=(duration, command_text), daemon=True).start()

    def _work(self, duration: float, command_text: str) -> None:
        """Worker-thread body: run the stress monitor and always emit ``_done``.

        ``command_text`` is split shell-style; an empty string means no command.
        Any exception, including a malformed command line, is delivered through
        ``_done`` so the GUI can re-enable its buttons and show the error.
        """
        import shlex

        from ..core import stress
        from ..render import format_raw  # imported once, not on every tick

        def _tick(sample, elapsed) -> None:
            # Condense the sample into one readout line: elapsed time + known GPU metrics.
            by = {r.key: r for r in sample.readings}
            bits = [f"{elapsed:5.0f}s"]
            for key, tag in (("gpu.temp", "core"), ("gpu.power", "pwr"),
                             ("gpu.util", "util"), ("gpu.clock.core", "clk"),
                             ("gpu.temp.memory", "vram")):
                r = by.get(key)
                if r is not None and r.value is not None:
                    bits.append(f"{tag} {format_raw(r.value, r.unit)}")
            self._tick.emit(" ".join(bits))

        try:
            # shlex.split raises ValueError on e.g. an unbalanced quote; keep it inside
            # the try so a typo in the command box cannot strand the dialog mid-run.
            command = shlex.split(command_text) if command_text else None
            result = stress.run(duration=duration, interval=0.5, command=command,
                                on_tick=_tick, should_stop=self._stop.is_set)
        except Exception as exc:  # never let a worker crash take down the dialog
            result = exc
        self._done.emit(result)

    def _on_tick(self, text: str) -> None:
        """Show the latest live readout line."""
        self._live.setText(text)

    def _on_done(self, result) -> None:
        """Reset the buttons and show the rendered result or the failure message."""
        from ..render import render_stress

        self._running = False
        self._start_btn.setEnabled(True)
        self._stop_btn.setEnabled(False)
        if isinstance(result, Exception):
            self._report.setPlainText(f"Stress run failed: {result}")
        else:
            self._report.setPlainText(render_stress(result))
        self._report.setVisible(True)

    def _on_stop(self) -> None:
        """Request a cooperative stop of the current run."""
        self._stop.set()
        self._stop_btn.setEnabled(False)
        self._live.setText("stopping…")

    def done(self, result: int) -> None:
        """Stop the run when the dialog is dismissed via accept()/reject()/Esc.

        Those paths end the dialog through ``done()`` without invoking the
        ``closeEvent`` handler, so the stop flag has to be set here as well.
        """
        self._stop.set()
        super().done(result)

    def closeEvent(self, event) -> None:  # stop the run if the dialog is closed mid-flight
        self._stop.set()
        super().closeEvent(event)
|
||||||
@@ -68,6 +68,8 @@ QMainWindow, #ContentArea, #Page {{ background: {BG}; }}
|
|||||||
QLabel {{ background: transparent; }}
|
QLabel {{ background: transparent; }}
|
||||||
|
|
||||||
#Sidebar {{ background: {SIDEBAR}; border-right: 1px solid {CARD_BORDER}; }}
|
#Sidebar {{ background: {SIDEBAR}; border-right: 1px solid {CARD_BORDER}; }}
|
||||||
|
#Footer {{ background: {SIDEBAR}; border-top: 1px solid {CARD_BORDER}; }}
|
||||||
|
#Footer QLabel {{ font-size: 11px; }}
|
||||||
#AppTitle {{ font-size: 17px; font-weight: 800; }}
|
#AppTitle {{ font-size: 17px; font-weight: 800; }}
|
||||||
#AppSubtitle {{ color: {MUTED}; font-size: 11px; }}
|
#AppSubtitle {{ color: {MUTED}; font-size: 11px; }}
|
||||||
|
|
||||||
|
|||||||
@@ -118,6 +118,32 @@ def render_health(findings: list, title: str = "Health report") -> str:
|
|||||||
return "\n".join(lines).rstrip()
|
return "\n".join(lines).rstrip()
|
||||||
|
|
||||||
|
|
||||||
|
def render_stress(result) -> str:
    """Format a stress result as plain text.

    Sections, in order: header with load and duration, the per-metric min/avg/max
    table, time spent above each temperature threshold, power peak, throttle
    reasons, faults, and finally the severity-tagged verdict. Optional sections
    are omitted when their data is empty.
    """
    out = ["GPU stress + thermal monitor", ""]
    out.append(f" Load : {result.load}")

    duration_line = (f" Duration : {_fmt_duration(result.duration)} · {result.samples} samples "
                     f"@ {result.interval:g}s")
    if result.aborted:
        duration_line += " (stopped early)"
    out.append(duration_line)

    if result.stats:
        out.append("")
        out.append(f" {'Metric':<22}{'min':>12}{'avg':>12}{'max':>12}")
        for stat in result.stats:
            unit = stat.unit
            low = format_raw(stat.min, unit)
            mean = format_raw(stat.avg, unit)
            high = format_raw(stat.max, unit)
            out.append(f" {stat.label:<22}{low:>12}{mean:>12}{high:>12}")

    if result.time_above:
        # Thresholds in ascending order.
        parts = []
        for threshold, seconds in sorted(result.time_above.items()):
            parts.append(f"≥{threshold}°C: {_fmt_duration(seconds)}")
        out.extend(["", f" Time at temp (core): {' '.join(parts)}"])

    if result.max_power is not None and result.power_limit:
        suffix = ""
        if result.power_capped:
            suffix = " — hit the power cap"
        out.append(
            f" Power peak: {result.max_power:.0f} W of {result.power_limit:.0f} W limit{suffix}")

    if result.throttle_reasons:
        reasons = ", ".join(result.throttle_reasons)
        out.append(f" Throttling: {reasons}")

    if result.faults:
        faults = "; ".join(result.faults)
        out.append(f" Faults : {faults}")

    severity = _SEV_LABEL.get(result.severity, "?")
    out.extend(["", f"[{severity}] {result.verdict}"])
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
def render_summary(summary: Summary, log_path=None) -> str:
|
def render_summary(summary: Summary, log_path=None) -> str:
|
||||||
if summary.samples == 0 and not summary.events:
|
if summary.samples == 0 and not summary.events:
|
||||||
where = f" ({log_path})" if log_path else ""
|
where = f" ({log_path})" if log_path else ""
|
||||||
|
|||||||
@@ -0,0 +1,85 @@
|
|||||||
|
"""Tests for user-added games (M6): add/remove/scan of titles no launcher reports (e.g. SPT)."""
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
from rigdoctor.core import customgames
|
||||||
|
|
||||||
|
|
||||||
|
class CustomGamesTests(unittest.TestCase):
    """Exercise customgames against a throwaway JSON file instead of the real config."""

    def setUp(self):
        # Redirect the module's storage path into a per-test temporary directory.
        self._tmp = tempfile.TemporaryDirectory()
        self._file = Path(self._tmp.name) / "custom-games.json"
        self._patch = mock.patch.object(customgames.config, "CUSTOM_GAMES_FILE", self._file)
        self._patch.start()

    def tearDown(self):
        self._patch.stop()
        self._tmp.cleanup()

    def test_missing_file_scans_empty(self):
        for lookup in (customgames.scan, customgames.names):
            self.assertEqual(lookup(), [])

    def test_add_then_scan_returns_game(self):
        added = customgames.add("SPT")
        self.assertTrue(added)
        found = customgames.scan()
        self.assertEqual(len(found), 1)
        entry = found[0]
        self.assertEqual(entry.name, "SPT")
        self.assertEqual(entry.launcher, "custom")
        self.assertTrue(self._file.exists())  # persisted

    def test_add_is_idempotent_case_insensitive(self):
        self.assertTrue(customgames.add("SPT"))
        # already present (different case), then blank
        for rejected in ("spt", " "):
            self.assertFalse(customgames.add(rejected))
        self.assertEqual(customgames.names(), ["SPT"])

    def test_remove(self):
        for title in ("SPT", "Minecraft"):
            customgames.add(title)
        self.assertTrue(customgames.remove("spt"))  # case-insensitive
        self.assertEqual(customgames.names(), ["Minecraft"])
        self.assertFalse(customgames.remove("nope"))

    def test_scan_sorted_by_name(self):
        for title in ("Zomboid", "Apex", "SPT"):
            customgames.add(title)
        scanned_names = [game.name for game in customgames.scan()]
        self.assertEqual(scanned_names, ["Apex", "SPT", "Zomboid"])

    def test_command_and_logdir_stored_and_resolved(self):
        base = Path(self._tmp.name)
        log_path = base / "logs"
        log_path.mkdir()
        script = base / "tarkov.sh"
        script.write_text("#!/bin/sh\n")
        self.assertTrue(customgames.add("SPT", command=str(script), logdir=str(log_path)))
        self.assertEqual(customgames.command("SPT"), [str(script)])
        self.assertEqual(customgames.log_dir("SPT"), str(log_path))

    def test_logdir_inferred_from_sibling_logs(self):
        # A command with a sibling logs/ dir (SPT's layout) → logdir auto-detected.
        base = Path(self._tmp.name)
        script = base / "tarkov.sh"
        script.write_text("#!/bin/sh\n")
        (base / "logs").mkdir()
        self.assertTrue(customgames.add("SPT", command=str(script)))
        self.assertEqual(customgames.log_dir("SPT"), str(base / "logs"))

    def test_no_command_resolves_to_none(self):
        customgames.add("SPT")
        self.assertIsNone(customgames.command("SPT"))
        self.assertIsNone(customgames.command("missing"))
        self.assertIsNone(customgames.log_dir("SPT"))

    def test_corrupt_file_degrades_to_empty(self):
        self._file.parent.mkdir(parents=True, exist_ok=True)
        self._file.write_text("{not json")
        self.assertEqual(customgames.scan(), [])
        # and a subsequent add still works (overwrites the garbage)
        self.assertTrue(customgames.add("SPT"))
        self.assertEqual(customgames.names(), ["SPT"])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -0,0 +1,67 @@
|
|||||||
|
"""Tests for display detection (Mutter D-Bus JSON + xrandr parsers)."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from rigdoctor.core import displays
|
||||||
|
|
||||||
|
# Minimal Mutter GetCurrentState (busctl --json) shape: current mode is 60 Hz, panel max 165 Hz.
|
||||||
|
_MUTTER_60 = (
|
||||||
|
'{"type":"x","data":[1,[[["DP-1","SAM","LC34G55T","S"],['
|
||||||
|
'["3440x1440@60",3440,1440,60.0,1.0,[1.0],{"is-current":{"type":"b","data":true}}],'
|
||||||
|
'["3440x1440@165",3440,1440,165.0,1.0,[1.0],{"is-preferred":{"type":"b","data":true}}]'
|
||||||
|
'],{}]],[],{}]}'
|
||||||
|
)
|
||||||
|
_MUTTER_MAX = (
|
||||||
|
'{"type":"x","data":[1,[[["DP-1","SAM","LC34G55T","S"],['
|
||||||
|
'["3440x1440@165",3440,1440,165.0,1.0,[1.0],{"is-current":{"type":"b","data":true}}],'
|
||||||
|
'["3440x1440@60",3440,1440,60.0,1.0,[1.0],{}]'
|
||||||
|
'],{}]],[],{}]}'
|
||||||
|
)
|
||||||
|
|
||||||
|
_XRANDR_60 = """Screen 0: minimum 8 x 8, current 3440 x 1440, maximum 16384 x 16384
|
||||||
|
DP-1 connected primary 3440x1440+0+0 (normal left inverted right x axis y axis) 800mm x 335mm
|
||||||
|
3440x1440 60.00*+ 165.00 100.00
|
||||||
|
2560x1440 165.00 60.00
|
||||||
|
HDMI-1 disconnected (normal left inverted right x axis y axis)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class MutterParseTests(unittest.TestCase):
    """Checks for displays._parse_mutter against the synthetic Mutter JSON fixtures."""

    def test_parses_and_flags_higher_refresh(self):
        parsed = displays._parse_mutter(_MUTTER_60)
        self.assertEqual(len(parsed), 1)
        monitor = parsed[0]
        self.assertEqual(monitor.connector, "DP-1")
        self.assertEqual(monitor.name, "Samsung LC34G55T")  # PNP code SAM mapped
        resolution = (monitor.width, monitor.height)
        self.assertEqual(resolution, (3440, 1440))
        self.assertEqual(round(monitor.refresh), 60)
        self.assertEqual(round(monitor.max_refresh), 165)
        self.assertTrue(monitor.can_go_faster)

    def test_at_max_is_not_flagged(self):
        monitor = displays._parse_mutter(_MUTTER_MAX)[0]
        self.assertEqual(round(monitor.refresh), 165)
        self.assertFalse(monitor.can_go_faster)

    def test_garbage_returns_empty(self):
        for payload in ("not json", "{}"):
            self.assertEqual(displays._parse_mutter(payload), [])
|
||||||
|
|
||||||
|
|
||||||
|
class XrandrParseTests(unittest.TestCase):
    """Checks for displays._parse_xrandr against the synthetic xrandr output fixture."""

    def test_current_and_max_refresh(self):
        parsed = displays._parse_xrandr(_XRANDR_60)
        self.assertEqual(len(parsed), 1)  # disconnected output ignored
        monitor = parsed[0]
        self.assertEqual(monitor.connector, "DP-1")
        resolution = (monitor.width, monitor.height)
        self.assertEqual(resolution, (3440, 1440))
        self.assertEqual(round(monitor.refresh), 60)
        self.assertEqual(round(monitor.max_refresh), 165)
        self.assertTrue(monitor.can_go_faster)

    def test_empty_returns_empty(self):
        nothing = displays._parse_xrandr("")
        self.assertEqual(nothing, [])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -0,0 +1,99 @@
|
|||||||
|
"""Tests for drive health parsing & findings (synthetic smartctl JSON)."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from dataclasses import asdict
|
||||||
|
|
||||||
|
from rigdoctor.core import drives
|
||||||
|
from rigdoctor.core.health import CRITICAL, INFO, OK, WARNING
|
||||||
|
|
||||||
|
_NVME_OK = {
|
||||||
|
"model_name": "Samsung SSD 980 PRO 1TB",
|
||||||
|
"device": {"protocol": "NVMe"},
|
||||||
|
"smart_status": {"passed": True},
|
||||||
|
"temperature": {"current": 41},
|
||||||
|
"power_on_time": {"hours": 1234},
|
||||||
|
"nvme_smart_health_information_log": {
|
||||||
|
"percentage_used": 3, "available_spare": 100, "available_spare_threshold": 10,
|
||||||
|
"media_errors": 0, "data_units_written": 200_000_000, # ~102 TB
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
_NVME_WORN = {
|
||||||
|
"model_name": "Worn NVMe",
|
||||||
|
"device": {"protocol": "NVMe"},
|
||||||
|
"smart_status": {"passed": True},
|
||||||
|
"nvme_smart_health_information_log": {"percentage_used": 96, "available_spare": 100,
|
||||||
|
"available_spare_threshold": 10},
|
||||||
|
}
|
||||||
|
|
||||||
|
_SATA_FAILING = {
|
||||||
|
"model_name": "Samsung SSD 870 QVO 1TB",
|
||||||
|
"device": {"protocol": "ATA"},
|
||||||
|
"smart_status": {"passed": False},
|
||||||
|
"temperature": {"current": 35},
|
||||||
|
"power_on_time": {"hours": 5000},
|
||||||
|
"ata_smart_attributes": {"table": [
|
||||||
|
{"id": 5, "name": "Reallocated_Sector_Ct", "value": 80, "raw": {"value": 12}},
|
||||||
|
{"id": 177, "name": "Wear_Leveling_Count", "value": 88, "raw": {"value": 300}},
|
||||||
|
{"id": 241, "name": "Total_LBAs_Written", "value": 99, "raw": {"value": 2_000_000_000}},
|
||||||
|
]},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ParseTests(unittest.TestCase):
    """Checks for drives.parse (and the dict round-trip) on synthetic smartctl JSON."""

    def test_nvme_parse(self):
        drive = drives.parse("/dev/nvme0", _NVME_OK)
        self.assertEqual(drive.kind, "nvme")
        self.assertTrue(drive.passed)
        self.assertEqual(drive.percent_used, 3)
        self.assertEqual(drive.health_pct, 97)  # 100 - percentage_used
        self.assertEqual(drive.power_on_hours, 1234)
        self.assertEqual(drive.temp_c, 41)
        self.assertAlmostEqual(drive.data_written_tb, 102.4, places=1)

    def test_sata_parse(self):
        drive = drives.parse("/dev/sda", _SATA_FAILING)
        self.assertEqual(drive.kind, "sata")
        self.assertFalse(drive.passed)
        self.assertEqual(drive.reallocated, 12)  # raw value
        self.assertEqual(drive.health_pct, 88)  # normalized wear-leveling value
        self.assertAlmostEqual(drive.data_written_tb, 1.02, places=1)

    def test_needs_root_when_no_data(self):
        drive = drives.parse("/dev/sda", None)
        self.assertTrue(drive.needs_root)

    def test_roundtrip_through_dicts(self):
        original = drives.parse("/dev/nvme0", _NVME_OK)
        restored = drives.from_dicts([asdict(original)])
        self.assertEqual(len(restored), 1)
        first = restored[0]
        self.assertEqual(first.model, original.model)
        self.assertEqual(first.health_pct, original.health_pct)
|
||||||
|
|
||||||
|
|
||||||
|
class FindingTests(unittest.TestCase):
    """Checks for the severity and wording of drives.to_findings on parsed fixtures."""

    @staticmethod
    def _first_finding(device, payload):
        # Parse one synthetic drive and return the first finding produced for it.
        parsed = drives.parse(device, payload)
        return drives.to_findings([parsed])[0]

    def test_healthy_nvme_is_ok_with_stats(self):
        finding = self._first_finding("/dev/nvme0", _NVME_OK)
        self.assertEqual(finding.severity, OK)
        for fragment in ("97% life left", "1,234 h"):
            self.assertIn(fragment, finding.title)

    def test_failing_sata_is_critical(self):
        finding = self._first_finding("/dev/sda", _SATA_FAILING)
        self.assertEqual(finding.severity, CRITICAL)
        for fragment in ("FAILED", "reallocated sectors"):
            self.assertIn(fragment, finding.detail)

    def test_worn_nvme_is_warning(self):
        finding = self._first_finding("/dev/nvme1", _NVME_WORN)
        self.assertEqual(finding.severity, WARNING)
        self.assertIn("worn", finding.title)

    def test_needs_root_is_info(self):
        finding = self._first_finding("/dev/sda", None)
        self.assertEqual(finding.severity, INFO)
        self.assertIn("needs root", finding.title)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -47,6 +47,36 @@ class CollectTests(unittest.TestCase):
|
|||||||
self.assertEqual(gamelogs.collect(), "")
|
self.assertEqual(gamelogs.collect(), "")
|
||||||
|
|
||||||
|
|
||||||
|
class CustomGameLogTests(unittest.TestCase):
    """Tests for gamelogs.collect() reading logs from a custom game's log directory."""

    def _scratch_dir(self) -> Path:
        # TemporaryDirectory + addCleanup removes the directory even when the test
        # fails; a bare mkdtemp() would leave one behind on every run.
        holder = tempfile.TemporaryDirectory()
        self.addCleanup(holder.cleanup)
        return Path(holder.name)

    def test_collect_includes_custom_game_logs(self):
        tmp = self._scratch_dir()
        (tmp / "tarkov-latest.log").write_text(">>> Tarkov gone. clean exit")
        (tmp / "server-latest.log").write_text("SPT server error: mod failed to load")
        with mock.patch.object(gamelogs, "_proton_logs", return_value=[]), \
                mock.patch.object(gamelogs, "_steam_console", return_value=None), \
                mock.patch("rigdoctor.core.customgames.log_dir", return_value=str(tmp)):
            out = gamelogs.collect(game="SPT")
        self.assertIn("SPT log", out)
        self.assertIn("server-latest.log", out)
        self.assertIn("mod failed to load", out)

    def test_custom_logs_skipped_when_stale(self):
        tmp = self._scratch_dir()
        old = tmp / "tarkov-latest.log"
        old.write_text("an earlier session")
        # Backdate the file by an hour so it falls outside the 60-second window below.
        old_mtime = time.time() - 3600
        os.utime(old, (old_mtime, old_mtime))
        with mock.patch.object(gamelogs, "_proton_logs", return_value=[]), \
                mock.patch.object(gamelogs, "_steam_console", return_value=None), \
                mock.patch("rigdoctor.core.customgames.log_dir", return_value=str(tmp)):
            self.assertEqual(gamelogs.collect(since=time.time() - 60, game="SPT"), "")

    def test_no_game_means_no_custom_logs(self):
        with mock.patch.object(gamelogs, "_proton_logs", return_value=[]), \
                mock.patch.object(gamelogs, "_steam_console", return_value=None):
            self.assertEqual(gamelogs.collect(), "")  # game=None → custom lookup skipped
|
||||||
|
|
||||||
|
|
||||||
class SinceScopingTests(unittest.TestCase):
|
class SinceScopingTests(unittest.TestCase):
|
||||||
def test_since_filter_keeps_window_only(self):
|
def test_since_filter_keeps_window_only(self):
|
||||||
text = (
|
text = (
|
||||||
|
|||||||
+108
-1
@@ -1,8 +1,28 @@
|
|||||||
"""Tests for the M4 health report's log scanner (synthetic input)."""
|
"""Tests for the M4 health report's log scanner (synthetic input)."""
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
from rigdoctor.core.health import CRITICAL, WARNING, run_health_checks, scan_journal_text
|
from rigdoctor.core import displays, health
|
||||||
|
from rigdoctor.core.health import (
|
||||||
|
CRITICAL,
|
||||||
|
INFO,
|
||||||
|
WARNING,
|
||||||
|
check_displays,
|
||||||
|
check_memory_speed,
|
||||||
|
check_nvidia_module,
|
||||||
|
check_pcie_links,
|
||||||
|
run_health_checks,
|
||||||
|
scan_journal_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
# A real no-Xid freeze: the open-module VA-space storm captured on 2026-05-29.
|
||||||
|
_VASPACE_LOG = """\
|
||||||
|
NVRM: nvCheckFailedNoLog: Check failed: 0 == (pMapNode->gpuMask & gpuMask) @ gpu_vaspace.c:4547
|
||||||
|
NVRM: dmaAllocMapping_GM107: can't update VA space for mapping @vaddr=0x4be00000
|
||||||
|
[drm:nv_drm_gem_alloc_nvkms_memory_ioctl [nvidia_drm]] *ERROR* Failed to allocate NVKMS memory for GEM object
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class HealthScanTests(unittest.TestCase):
|
class HealthScanTests(unittest.TestCase):
|
||||||
@@ -32,6 +52,28 @@ class HealthScanTests(unittest.TestCase):
|
|||||||
def test_clean_text_yields_no_findings(self):
|
def test_clean_text_yields_no_findings(self):
|
||||||
self.assertEqual(scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), [])
|
self.assertEqual(scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), [])
|
||||||
|
|
||||||
|
def test_vaspace_freeze_detected_without_any_xid(self):
    # Keep only the GPU-category findings produced for the VA-space log sample.
    gpu_findings = []
    for item in scan_journal_text(_VASPACE_LOG):
        if item.category == "GPU":
            gpu_findings.append(item)
    self.assertEqual(len(gpu_findings), 1)
    only = gpu_findings[0]
    self.assertEqual(only.severity, WARNING)
    self.assertIn("VA-space", only.title)
    # It must NOT be misreported as an Xid finding (the log has no Xid at all).
    self.assertNotIn("Xid", only.title)
    self.assertIn("open kernel module", only.detail.lower())
|
||||||
|
|
||||||
|
def test_open_module_finding_when_open_loaded(self):
    # Pretend the open NVIDIA kernel module is the one loaded.
    open_loaded = mock.patch("rigdoctor.core.health._nvidia_module_is_open", return_value=True)
    with open_loaded:
        reported = check_nvidia_module()
    self.assertEqual(len(reported), 1)
    finding = reported[0]
    self.assertEqual(finding.severity, INFO)
    self.assertEqual(finding.category, "Driver")
|
||||||
|
|
||||||
|
def test_no_module_finding_when_proprietary_or_absent(self):
    # False and None are the two non-open states this test covers; neither should
    # produce a finding.
    for state in (False, None):
        # subTest labels a failure with the offending state and still runs the other case.
        with self.subTest(state=state):
            with mock.patch("rigdoctor.core.health._nvidia_module_is_open", return_value=state):
                self.assertEqual(check_nvidia_module(), [])
|
||||||
|
|
||||||
def test_run_health_checks_returns_findings(self):
|
def test_run_health_checks_returns_findings(self):
|
||||||
# Runs against the real system; just assert it returns a sorted list of Findings.
|
# Runs against the real system; just assert it returns a sorted list of Findings.
|
||||||
findings = run_health_checks()
|
findings = run_health_checks()
|
||||||
@@ -42,5 +84,70 @@ class HealthScanTests(unittest.TestCase):
|
|||||||
self.assertEqual(ranks, sorted(ranks))
|
self.assertEqual(ranks, sorted(ranks))
|
||||||
|
|
||||||
|
|
||||||
|
class PcieLinkCheckTests(unittest.TestCase):
    """Checks for check_pcie_links with the inventory lookups patched to fixed values."""

    def _with_link(self, cur_g, cur_w, max_g, max_w):
        # one fake NVMe controller returning the given link tuple
        fake_controllers = [("nvme0", Path("/x"))]
        fake_link = (cur_g, cur_w, max_g, max_w)
        controllers_patch = mock.patch(
            "rigdoctor.core.inventory.nvme_controllers", return_value=fake_controllers)
        link_patch = mock.patch("rigdoctor.core.inventory.read_link", return_value=fake_link)
        return (controllers_patch, link_patch)

    def test_reduced_width_is_a_warning_about_lane_sharing(self):
        controllers_patch, link_patch = self._with_link(4, "2", 4, "4")  # Gen4 x2 but supports x4
        with controllers_patch, link_patch:
            reported = check_pcie_links()
        self.assertEqual(len(reported), 1)
        finding = reported[0]
        self.assertEqual(finding.severity, WARNING)
        self.assertIn("lane-sharing", finding.detail)

    def test_reduced_speed_only_is_info(self):
        controllers_patch, link_patch = self._with_link(3, "4", 4, "4")  # Gen3 x4 but supports Gen4
        with controllers_patch, link_patch:
            reported = check_pcie_links()
        self.assertEqual(len(reported), 1)
        self.assertEqual(reported[0].severity, INFO)

    def test_full_speed_no_finding(self):
        controllers_patch, link_patch = self._with_link(4, "4", 4, "4")
        with controllers_patch, link_patch:
            self.assertEqual(check_pcie_links(), [])
|
||||||
|
|
||||||
|
|
||||||
|
class DisplayCheckTests(unittest.TestCase):
    """Checks for check_displays with displays.collect patched to a single monitor."""

    def test_lower_than_max_refresh_is_flagged(self):
        slow_monitor = displays.Monitor("DP-1", "Samsung LC34G55T", 3440, 1440, 60.0, 165.0)
        collect_patch = mock.patch("rigdoctor.core.displays.collect", return_value=[slow_monitor])
        with collect_patch:
            reported = check_displays()
        self.assertEqual(len(reported), 1)
        finding = reported[0]
        self.assertEqual(finding.severity, INFO)
        self.assertIn("165", finding.title)

    def test_at_max_refresh_no_finding(self):
        maxed_monitor = displays.Monitor("DP-1", "Samsung LC34G55T", 3440, 1440, 165.0, 165.0)
        collect_patch = mock.patch("rigdoctor.core.displays.collect", return_value=[maxed_monitor])
        with collect_patch:
            self.assertEqual(check_displays(), [])
|
||||||
|
|
||||||
|
|
||||||
|
class MemorySpeedCheckTests(unittest.TestCase):
    """Checks for check_memory_speed with the dmidecode lookup patched to one module."""

    def _dmi(self, configured, part):
        # A single memory module whose configured and reported speeds are the same value.
        module = {
            "Configured Memory Speed": configured,
            "Speed": configured,
            "Part Number": part,
        }
        return {"memory": [module]}

    def test_flags_unapplied_expo(self):
        fake_dmi = self._dmi("4800 MT/s", "CMK32GX5M2B5600Z36")
        privileged_patch = mock.patch("rigdoctor.core.elevation.privileged", return_value=None)
        dmidecode_patch = mock.patch("rigdoctor.core.inventory._dmidecode", return_value=fake_dmi)
        with privileged_patch, dmidecode_patch:
            reported = check_memory_speed()
        self.assertEqual(len(reported), 1)
        finding = reported[0]
        self.assertEqual(finding.severity, INFO)
        self.assertIn("5600", finding.title)

    def test_no_flag_at_rated(self):
        fake_dmi = self._dmi("5600 MT/s", "CMK32GX5M2B5600Z36")
        privileged_patch = mock.patch("rigdoctor.core.elevation.privileged", return_value=None)
        dmidecode_patch = mock.patch("rigdoctor.core.inventory._dmidecode", return_value=fake_dmi)
        with privileged_patch, dmidecode_patch:
            self.assertEqual(check_memory_speed(), [])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
"""Tests for the M5 system inventory (render + dict round-trip; collect on real system)."""
|
"""Tests for the M5 system inventory (render + dict round-trip; collect on real system)."""
|
||||||
|
|
||||||
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from rigdoctor.core import inventory
|
from rigdoctor.core import inventory
|
||||||
from rigdoctor.core.inventory import Section
|
from rigdoctor.core.inventory import Section
|
||||||
@@ -26,5 +28,49 @@ class InventoryTests(unittest.TestCase):
|
|||||||
self.assertIn("- **Model:** Test CPU", md)
|
self.assertIn("- **Model:** Test CPU", md)
|
||||||
|
|
||||||
|
|
||||||
|
class PcieLinkTests(unittest.TestCase):
    """Tests for the PCIe link helpers in the inventory module."""

    def test_gen_mapping(self):
        """_gen maps a link-speed string to a generation number, or None for an empty string."""
        self.assertEqual(inventory._gen("16.0 GT/s PCIe"), 4)
        self.assertEqual(inventory._gen("8.0 GT/s PCIe"), 3)
        self.assertIsNone(inventory._gen(""))

    def _fake_dev(self, cur_s, cur_w, max_s, max_w) -> Path:
        """Create a temporary device directory holding the four link attribute files.

        The directory is registered for removal via addCleanup, so it is deleted when
        the test finishes (pass or fail) instead of being left behind in the temp dir.

        Returns the path of the created directory.
        """
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        d = Path(tmp.name)
        (d / "current_link_speed").write_text(cur_s)
        (d / "current_link_width").write_text(cur_w)
        (d / "max_link_speed").write_text(max_s)
        (d / "max_link_width").write_text(max_w)
        return d

    def test_link_at_full_speed(self):
        """Current == max is described without a capability note."""
        dev = self._fake_dev("16.0 GT/s PCIe", "4", "16.0 GT/s PCIe", "4")
        self.assertEqual(inventory._link_desc(dev), "PCIe Gen4 x4")

    def test_link_downtrained_flags_capability(self):
        """A link running below its maximum speed also reports what it is capable of."""
        dev = self._fake_dev("8.0 GT/s PCIe", "4", "16.0 GT/s PCIe", "4")
        self.assertEqual(inventory._link_desc(dev), "PCIe Gen3 x4 (capable of Gen4 x4)")

    def test_non_nvme_has_no_link(self):
        """_nvme_link returns an empty string for the device name "sda"."""
        self.assertEqual(inventory._nvme_link("sda"), "")
|
||||||
|
|
||||||
|
|
||||||
|
class MemorySpeedTests(unittest.TestCase):
    """Tests for rated-speed extraction and module_speed in the inventory module."""

    @staticmethod
    def _module(speed):
        """Return a module record whose two speed fields both hold `speed`."""
        return {
            "Configured Memory Speed": speed,
            "Speed": speed,
            "Part Number": "CMK32GX5M2B5600Z36",
        }

    def test_rated_speed_from_part_number(self):
        """_rated_from_part extracts the speed from a part number, or None if absent."""
        known = (("CMK32GX5M2B5600Z36", 5600), ("F5-6000J3038F16G", 6000))
        for part, rated in known:
            self.assertEqual(inventory._rated_from_part(part), rated)
        self.assertIsNone(inventory._rated_from_part("NoSpeedHere"))

    def test_detects_unapplied_expo(self):
        """Configured 4800 with a 5600 part number yields (4800, 5600)."""
        # XMP/EXPO off: dmidecode only sees JEDEC 4800; the 5600 is in the part number.
        record = self._module("4800 MT/s")
        self.assertEqual(inventory.module_speed(record), (4800, 5600))

    def test_at_rated_speed(self):
        """Configured 5600 with a 5600 part number yields (5600, 5600)."""
        record = self._module("5600 MT/s")
        self.assertEqual(inventory.module_speed(record), (5600, 5600))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -0,0 +1,163 @@
|
|||||||
|
"""Tests for the .dmp minidump parser (M14) — builds a synthetic MDMP, no external tools."""
|
||||||
|
|
||||||
|
import struct
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
from rigdoctor.core import minidump
|
||||||
|
|
||||||
|
|
||||||
|
def _synthetic_dump() -> bytes:
    """Build a minimal MDMP image: header, SystemInfo, Exception and a 2-module ModuleList.

    Absolute file offsets: header@0, directory@32, SystemInfo@68, Exception@96,
    ModuleList@264, name strings@484. The first module's address range contains the
    exception address, which makes it the faulting module.
    """
    image = bytearray(600)

    # Header: signature, version, 3 streams, directory at offset 32, then timestamp/flags.
    struct.pack_into("<4sIIIIIQ", image, 0, b"MDMP", 0xA793, 3, 32, 0, 1_700_000_000, 0)

    # Stream directory rows: (entry offset, stream type, data size, data offset).
    directory = (
        (32, 7, 28, 68),      # SystemInfoStream
        (44, 6, 168, 96),     # ExceptionStream
        (56, 4, 220, 264),    # ModuleListStream
    )
    for entry_off, stream_type, data_size, data_off in directory:
        struct.pack_into("<III", image, entry_off, stream_type, data_size, data_off)

    # SystemInfo: x86-64, 16 CPUs, Windows 10.0.19041 (PlatformId 2 = Win32 NT).
    struct.pack_into("<HHHBBIIIII", image, 68, 9, 0, 0, 16, 1, 10, 0, 19041, 2, 0)

    # Exception: access violation (write) at 0x140001234.
    exc_base = 96
    exception_fields = (
        ("<I", 0, 4321),           # ThreadId
        ("<I", 8, 0xC0000005),     # ExceptionCode
        ("<Q", 24, 0x140001234),   # ExceptionAddress
        ("<I", 32, 2),             # NumberParameters
        ("<Q", 40, 1),             # info[0] = write
        ("<Q", 48, 0x0),           # info[1] = faulting address
    )
    for fmt, rel, value in exception_fields:
        struct.pack_into(fmt, image, exc_base + rel, value)

    # ModuleList: a count of 2, then one record per module.
    struct.pack_into("<I", image, 264, 2)
    first_record = 268
    second_record = first_record + minidump._MODULE_STRIDE
    records = (
        # (record offset, base, size, name RVA); the first size spans the exception address
        (first_record, 0x140000000, 0x100000, 484),
        (second_record, 0x180000000, 0x080000, 522),
    )
    for record_off, base, size, name_rva in records:
        struct.pack_into("<Q", image, record_off, base)
        struct.pack_into("<I", image, record_off + 8, size)
        struct.pack_into("<I", image, record_off + 20, name_rva)

    # Name strings: a 4-byte length followed by the UTF-16-LE bytes.
    names = (
        (484, "C:\\Games\\game.exe"),
        (522, "nvwgf2umx.dll"),
    )
    for name_off, text in names:
        encoded = text.encode("utf-16-le")
        struct.pack_into("<I", image, name_off, len(encoded))
        start = name_off + 4
        image[start:start + len(encoded)] = encoded

    return bytes(image)
|
||||||
|
|
||||||
|
|
||||||
|
class ParseTests(unittest.TestCase):
    """minidump.parse and its renderers, run against the synthetic dump written to disk."""

    def setUp(self):
        """Write the synthetic dump to a temporary .dmp file and remember its path.

        The unlink is registered with addCleanup as soon as the file exists: cleanups
        still run when setUp itself raises (tearDown would not), so a failure while
        building or writing the dump cannot leave the file behind.
        """
        with tempfile.NamedTemporaryFile(suffix=".dmp", delete=False) as fh:
            self.path = fh.name
            self.addCleanup(Path(self.path).unlink, missing_ok=True)
            fh.write(_synthetic_dump())

    def _parse(self):
        """Parse the temporary dump with the external stackwalk step disabled."""
        return minidump.parse(self.path, run_stackwalk=False)

    def test_parses_exception_and_faulting_module(self):
        """Exception code, crash reason, faulting module and thread id are extracted."""
        r = self._parse()
        self.assertTrue(r.ok, r.error)
        self.assertEqual(r.exception_code, 0xC0000005)
        self.assertIn("Access violation", r.crash_reason)
        self.assertIn("writing 0x0", r.crash_reason)
        self.assertEqual(r.faulting_module, "game.exe")  # basename, address inside module0
        self.assertEqual(r.crashing_thread, 4321)

    def test_parses_system_info_and_modules(self):
        """OS name, CPU architecture/count and the module basenames are extracted."""
        r = self._parse()
        self.assertEqual(r.os_name, "Windows 10.0.19041")
        self.assertEqual(r.cpu_arch, "x86-64")
        self.assertEqual(r.cpu_count, 16)
        self.assertEqual([m.name for m in r.modules], ["game.exe", "nvwgf2umx.dll"])

    def test_to_text_and_ai_text(self):
        """Both renderers include the key facts; the AI text adds the Proton framing."""
        r = self._parse()
        text = minidump.to_text(r)
        self.assertIn("game.exe", text)
        self.assertIn("nvwgf2umx.dll", text)
        self.assertIn("Access violation", text)
        ai_text = minidump.to_ai_text(r)
        self.assertIn("Proton", ai_text)  # Linux/Proton framing for the model
        self.assertIn("Crash reason", ai_text)

    def test_to_findings(self):
        """The first finding is CRITICAL and names the faulting module in its title."""
        findings = minidump.to_findings(self._parse())
        self.assertEqual(findings[0].severity, minidump.CRITICAL)
        self.assertIn("game.exe", findings[0].title)

    def test_run_stackwalk_false_skips_external_tool(self):
        """With run_stackwalk=False the stackwalk field stays empty."""
        self.assertEqual(self._parse().stackwalk, "")
|
||||||
|
|
||||||
|
|
||||||
|
class RobustnessTests(unittest.TestCase):
    """Failure paths: bad signature, unreadable file, and a missing stackwalk tool."""

    def test_non_minidump_file(self):
        """A file without the minidump signature parses as not-ok with a signature error."""
        with tempfile.NamedTemporaryFile(suffix=".dmp", delete=False) as handle:
            handle.write(b"not a dump at all")
            bogus = handle.name
        try:
            result = minidump.parse(bogus, run_stackwalk=False)
        finally:
            # Remove the file whether or not parsing raised.
            Path(bogus).unlink(missing_ok=True)
        self.assertFalse(result.ok)
        self.assertIn("signature", result.error)

    def test_missing_file(self):
        """A nonexistent path parses as not-ok with a "can't read" error."""
        result = minidump.parse("/nonexistent/does-not-exist.dmp", run_stackwalk=False)
        self.assertFalse(result.ok)
        self.assertIn("can't read", result.error)

    def test_stackwalk_absent_returns_empty(self):
        """stackwalk() returns "" when shutil.which reports no tool."""
        no_tool = mock.patch.object(minidump.shutil, "which", return_value=None)
        with no_tool:
            output = minidump.stackwalk("/whatever.dmp")
        self.assertEqual(output, "")
|
||||||
|
|
||||||
|
|
||||||
|
class CliDumpTests(unittest.TestCase):
    """`rigdoctor ai dump <file>` parses then explains via the configured provider."""

    def _args(self, **over):
        """Return an argparse.Namespace for `ai dump`, with `over` overriding the defaults."""
        import argparse

        fields = {"ai_cmd": "dump", "file": ""}
        fields.update(over)
        return argparse.Namespace(**fields)

    def test_dump_parses_and_explains(self):
        """A valid dump returns 0 and the text handed to ai.explain carries the parsed facts."""
        from rigdoctor.core import ai

        with tempfile.NamedTemporaryFile(suffix=".dmp", delete=False) as handle:
            handle.write(_synthetic_dump())
            dump_path = handle.name
        try:
            with mock.patch.object(ai, "is_configured", return_value=True):
                with mock.patch.object(ai, "provider_label", return_value="Claude (test)"):
                    with mock.patch.object(minidump, "stackwalk", return_value=""):
                        with mock.patch.object(
                            ai, "explain", return_value=(True, "Likely DXVK.")
                        ) as explain:
                            from rigdoctor import cli

                            rc = cli.cmd_ai(self._args(file=dump_path))
        finally:
            Path(dump_path).unlink(missing_ok=True)
        self.assertEqual(rc, 0)
        prompt = explain.call_args[0][0]
        self.assertIn("Proton", prompt)  # the Linux/Proton framing reached the model
        self.assertIn("game.exe", prompt)

    def test_dump_bad_file_returns_error(self):
        """A path that does not exist makes cmd_ai return 1."""
        from rigdoctor.core import ai

        with mock.patch.object(ai, "is_configured", return_value=True):
            from rigdoctor import cli

            rc = cli.cmd_ai(self._args(file="/nope/missing.dmp"))
        self.assertEqual(rc, 1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -0,0 +1,77 @@
|
|||||||
|
"""Tests for the GPU stress + thermal-monitor analysis (synthetic ticks, no real GPU)."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from rigdoctor.core import stress
|
||||||
|
from rigdoctor.core.health import CRITICAL, OK, WARNING
|
||||||
|
|
||||||
|
|
||||||
|
def _tick(temp=None, power=None, throttle=(), capped=False, lost=False, dt=1.0, **extra):
    """Build a stress._Tick for tests.

    `temp` and `power` are stored under "gpu.temp" / "gpu.power" only when not None;
    any `extra` keyword arguments are merged into the values afterwards (and so win
    on a key clash). `throttle` is copied into a list.
    """
    named = (("gpu.temp", temp), ("gpu.power", power))
    readings = {key: value for key, value in named if value is not None}
    readings.update(extra)
    return stress._Tick(
        dt=dt,
        values=readings,
        throttle=list(throttle),
        power_capped=capped,
        lost=lost,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SummarizeTests(unittest.TestCase):
    """stress.summarize verdicts for hand-built tick sequences."""

    def _summary(self, ticks, *, load="x", interval=1.0, faults=()):
        """Call stress.summarize with keyword defaults shared by most tests."""
        return stress.summarize(ticks, load=load, interval=interval, faults=list(faults))

    def test_stable_run_is_ok(self):
        """A cool, unthrottled run is OK and reports peak temp, max power and power limit."""
        ticks = [
            _tick(temp=celsius, power=200, **{"gpu.power_limit": 280})
            for celsius in (60, 65, 70, 72)
        ]
        report = self._summary(ticks, load="monitor-only")
        self.assertEqual(report.severity, OK)
        self.assertEqual(report.peak_temp, 72)
        self.assertEqual(report.max_power, 200)
        self.assertEqual(report.power_limit, 280)
        self.assertFalse(report.throttled)
        self.assertIn("Stable", report.verdict)

    def test_dwell_time_above_thresholds(self):
        """time_above accumulates dt per threshold and omits thresholds never reached."""
        # 3 ticks of 2s each at 82/86/92 °C → ≥80 for all 6s, ≥85 for 4s, ≥90 for 2s.
        ticks = [_tick(temp=celsius, dt=2.0) for celsius in (82, 86, 92)]
        report = self._summary(ticks, interval=2.0)
        self.assertEqual(report.time_above[80], 6.0)
        self.assertEqual(report.time_above[85], 4.0)
        self.assertEqual(report.time_above[90], 2.0)
        self.assertNotIn(95, report.time_above)  # never reached → omitted

    def test_throttling_is_a_warning(self):
        """A tick carrying a throttle reason makes the run a WARNING and records the reason."""
        report = self._summary([_tick(temp=88, throttle=["HW thermal slowdown"])])
        self.assertEqual(report.severity, WARNING)
        self.assertTrue(report.throttled)
        self.assertIn("HW thermal slowdown", report.throttle_reasons)

    def test_high_temp_without_throttle_is_a_warning(self):
        """93 °C with no throttle reason is still a WARNING with "hot" in the verdict."""
        report = self._summary([_tick(temp=93)])
        self.assertEqual(report.severity, WARNING)
        self.assertIn("hot", report.verdict.lower())

    def test_gpu_lost_is_critical(self):
        """A tick flagged lost makes the run CRITICAL and sets gpu_lost."""
        report = self._summary([_tick(temp=70), _tick(lost=True)])
        self.assertEqual(report.severity, CRITICAL)
        self.assertTrue(report.gpu_lost)

    def test_journal_fault_is_critical(self):
        """A reported fault makes the run CRITICAL and appears in the verdict."""
        report = self._summary([_tick(temp=70)], faults=["NVIDIA Xid 79 ×1"])
        self.assertEqual(report.severity, CRITICAL)
        self.assertIn("Xid 79", report.verdict)

    def test_no_telemetry_is_info(self):
        """A tick with no readings gives severity "info" and no peak temperature."""
        report = self._summary([_tick()], load="monitor-only")
        self.assertEqual(report.severity, "info")
        self.assertIsNone(report.peak_temp)
|
||||||
|
|
||||||
|
|
||||||
|
class ThrottleDecodeTests(unittest.TestCase):
    """Checks on the throttle-reason table in the stress module."""

    def test_throttle_bits_map_to_reasons(self):
        """Both thermal-slowdown labels are present among the _THROTTLE_BITS values."""
        # the constants used by _throttle_state decode the NVML active-reasons bitmask
        for label in ("HW thermal slowdown", "SW thermal slowdown"):
            self.assertIn(label, stress._THROTTLE_BITS.values())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
"""Tests for the M13 updater: install detection + routing the update to the right method."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
from rigdoctor.core import updates
|
||||||
|
|
||||||
|
|
||||||
|
class InstallKindTests(unittest.TestCase):
    """updates.install_kind() detection, with its cache cleared around every test."""

    def setUp(self):
        """Clear the install_kind cache so an earlier result cannot leak in."""
        updates.install_kind.cache_clear()

    def tearDown(self):
        """Clear the install_kind cache so this test's result cannot leak out."""
        updates.install_kind.cache_clear()

    def test_apt_when_dpkg_owns_the_package(self):
        """When _dpkg_owns reports True the install kind is "apt"."""
        owned = mock.patch.object(updates, "_dpkg_owns", return_value=True)
        with owned:
            kind = updates.install_kind()
        self.assertEqual(kind, "apt")

    def test_pip_when_running_in_a_venv(self):
        """Not dpkg-owned and sys.prefix != sys.base_prefix gives "pip"."""
        with mock.patch.object(updates, "_dpkg_owns", return_value=False):
            with mock.patch.object(updates.sys, "prefix", "/opt/venv"):
                with mock.patch.object(updates.sys, "base_prefix", "/usr"):
                    kind = updates.install_kind()
        self.assertEqual(kind, "pip")
|
||||||
|
|
||||||
|
|
||||||
|
class ApplyUpdateRoutingTests(unittest.TestCase):
    """updates.apply_update() routes on install_kind(): guidance text or a pip run."""

    def test_apt_returns_guidance_and_never_runs_pip(self):
        """An apt install returns rc 1 with the apt upgrade command and runs no subprocess."""
        with mock.patch.object(updates, "install_kind", return_value="apt"):
            with mock.patch("subprocess.run") as run:
                rc, out = updates.apply_update("v9.9.9")
        self.assertEqual(rc, 1)
        self.assertIn("apt install --only-upgrade", out)
        run.assert_not_called()

    def test_dev_returns_guidance_and_never_runs_pip(self):
        """A dev checkout returns "git pull" guidance and runs no subprocess."""
        with mock.patch.object(updates, "install_kind", return_value="dev"):
            with mock.patch("subprocess.run") as run:
                _rc, out = updates.apply_update("v9.9.9")
        self.assertIn("git pull", out)
        run.assert_not_called()

    def test_pip_install_runs_pip(self):
        """A pip install invokes subprocess.run with a command containing pip and install."""
        finished = mock.Mock(returncode=0, stdout="Successfully installed", stderr="")
        with mock.patch.object(updates, "install_kind", return_value="pip"):
            with mock.patch.object(updates, "load_token", return_value="TOK"):
                with mock.patch("subprocess.run", return_value=finished) as run:
                    rc, _out = updates.apply_update("v1.2.3")
        self.assertEqual(rc, 0)
        command = run.call_args[0][0]
        self.assertIn("pip", command)
        self.assertIn("install", command)
|
||||||
|
|
||||||
|
|
||||||
|
class UpdateHintTests(unittest.TestCase):
    """updates.update_hint() text for each install kind."""

    def test_apt_hint_names_the_apt_command(self):
        """The apt hint contains the full apt upgrade command for rigdoctor."""
        hint = updates.update_hint("apt")
        self.assertIn("apt install --only-upgrade rigdoctor", hint)

    def test_dev_hint_says_git_pull(self):
        """The dev hint contains "git pull"."""
        hint = updates.update_hint("dev")
        self.assertIn("git pull", hint)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user