From 4bd51a40c33d3c7f9b262e5e6b5ab6ab1330df77 Mon Sep 17 00:00:00 2001 From: Jessey van Offeren Date: Fri, 22 May 2026 14:16:23 +0200 Subject: [PATCH] =?UTF-8?q?feat(m15):=20nvidia-smi=20snapshot=20+=20displa?= =?UTF-8?q?y=20logs=20+=20inventory=20in=20reports=20=E2=80=94=200.32.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand diagnostic/report collection (all stored per-diagnostic, in the Report zip; logs also fed to the AI on "Explain"): - syslogs: nvidia-smi -q snapshot (driver/throttle/clocks/power/temps/PCIe/ECC/ retired pages) + display-server log auto-detected — Xorg.0.log on X11, or the compositor user-journal slice (gnome-shell/kwin/sway/gamescope) on Wayland. - diagstore: include the full M5 inventory (inventory.txt + .json) — invaluable for larger/shared debugging. inventory.collect() degrades gracefully (no root prompt). Best-effort throughout. - Tests for nvidia/display + inventory in store; docs (M15/SPEC). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 12 +++++ docs/MODULES.md | 13 ++--- docs/SPEC.md | 9 ++-- pyproject.toml | 2 +- src/rigdoctor/__init__.py | 2 +- src/rigdoctor/core/diagstore.py | 9 ++++ src/rigdoctor/core/syslogs.py | 89 +++++++++++++++++++++++++++++---- tests/test_diagstore.py | 4 ++ tests/test_syslogs.py | 53 +++++++++++++++++--- 9 files changed, 166 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f1e04b..b460593 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,18 @@ All notable changes to RigDoctor are recorded here. Format follows (`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git release tag (so the auto-updater, D18, can compare versions). +## [0.32.0] - 2026-05-22 +### Added +- **More for diagnostics & reports:** + - **`nvidia-smi -q` snapshot** — driver, throttle/clock-event reasons, clocks, power, temps, + PCIe link, ECC + retired pages (point-in-time at diagnostic time). 
+ - **Display-server log** — auto-detected: `Xorg.0.log` on X11, or the compositor's user-journal + slice (gnome-shell/kwin/sway/gamescope) on Wayland. + - **Full system inventory** (M5 hardware/OS) is now included in each stored diagnostic and the + **Report** bundle — invaluable for larger/shared debugging. + The two log sections join the kernel log + coredump records in `syslogs.txt`; the inventory is written + to `inventory.txt`/`inventory.json`. All are saved per diagnostic and included in the Report zip; the logs are also fed to the AI on "Explain". + ## [0.31.0] - 2026-05-22 ### Added - **Diagnostics now collect session-scoped system logs** (`core/syslogs.py`): a kernel-log diff --git a/docs/MODULES.md b/docs/MODULES.md index 8ffea2b..d3a5720 100644 --- a/docs/MODULES.md +++ b/docs/MODULES.md @@ -132,12 +132,13 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done - **M15 Logging & report bundles** (D25) — opt-in via one `logging_enabled` toggle (default off): application logging to a rotating `app.log` (`core/applog.py`) and **per-diagnostic storage** - (`core/diagstore.py`) — each diagnostic gets its own `DATA_DIR/diagnostics//` (capture, - `result.json`, `report.txt`, scoped **game logs** (`core/gamelogs.py`) and **system logs** - (`core/syslogs.py` — `journalctl -k` slice + `coredumpctl` crashed-process records), and an - `ai/` record of every AI interaction: exact data sent, model, reply). **"Report"** zips one - into `DATA_DIR/reports/` (GUI button on the diagnostic dialog; CLI `rigdoctor bundle`). All - logs are session-scoped and fed to the AI on "Explain". Stays local; shareable on demand.
+ (`core/diagstore.py`) — each diagnostic gets its own `DATA_DIR/diagnostics//`: capture, + `result.json`, `report.txt`, the full **inventory** (M5: hardware/OS), scoped **game logs** + (`core/gamelogs.py`), scoped **system logs** (`core/syslogs.py` — `journalctl -k`, + `coredumpctl`, an `nvidia-smi -q` snapshot, and the X11/Wayland display-server log), and an + `ai/` record of every AI interaction (exact data sent, model, reply). **"Report"** zips one + into `DATA_DIR/reports/` (GUI button on the diagnostic dialog; CLI `rigdoctor bundle`). Logs + are session-scoped and fed to the AI on "Explain". Stays local; shareable on demand. ## Bundles (final — D14) - **Essential:** M1 + M3 + M4 *(the MVP, NVIDIA-only — D5)* diff --git a/docs/SPEC.md b/docs/SPEC.md index 1b909ea..86c23ca 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -165,10 +165,11 @@ the actual findings plus matched reference facts from a curated, exact-match kno ### M15 — Logging & report bundles (D25) Opt-in (one `logging_enabled` toggle, default off). When on: the application logs to a rotating `app.log`, and **each diagnostic is stored in its own directory** (capture log, structured -result, human-readable report, session-scoped **game logs** (Proton/Steam) and **system logs** -(`journalctl -k` slice + `coredumpctl` crashed-process records), and a record of every AI -interaction — the exact data sent, the model, and its reply). The collected logs are also fed to -the AI on "Explain". System-log collection is best-effort (degrades if tools are missing/denied). A **Report** action zips one diagnostic's directory +result, human-readable report, the full **inventory** (M5 hardware/OS), session-scoped **game +logs** (Proton/Steam) and **system logs** (`journalctl -k`, `coredumpctl`, an `nvidia-smi -q` +snapshot, and the X11/Wayland display-server log), and a record of every AI interaction — the +exact data sent, the model, and its reply). The collected logs are also fed to the AI on +"Explain". 
Collection is best-effort (degrades if tools are missing/denied). A **Report** action zips one diagnostic's directory (plus the app log) into a shareable bundle saved under the reports folder (GUI button; CLI `rigdoctor bundle`). Everything stays local — a report only leaves the machine if the user shares the zip. Stdlib only (`logging` + `zipfile`). diff --git a/pyproject.toml b/pyproject.toml index 69f5cc5..07bf0a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "rigdoctor" -version = "0.31.0" +version = "0.32.0" description = "Modular hardware monitoring & crash diagnostics for Linux gamers." readme = "README.md" requires-python = ">=3.11" diff --git a/src/rigdoctor/__init__.py b/src/rigdoctor/__init__.py index f537844..36d8de0 100644 --- a/src/rigdoctor/__init__.py +++ b/src/rigdoctor/__init__.py @@ -1,3 +1,3 @@ """RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers.""" -__version__ = "0.31.0" +__version__ = "0.32.0" diff --git a/src/rigdoctor/core/diagstore.py b/src/rigdoctor/core/diagstore.py index f4ae4fc..f883182 100644 --- a/src/rigdoctor/core/diagstore.py +++ b/src/rigdoctor/core/diagstore.py @@ -87,6 +87,15 @@ def store(result, capture_path=None, since: float | None = None) -> Path | None: _write(target / "syslogs.txt", sys_logs) except OSError: pass + + try: # full hardware/OS inventory (M5) — invaluable for larger debugging in a shared report + from . 
import inventory + + sections = inventory.collect() + _write(target / "inventory.txt", inventory.render_text(sections)) + _write(target / "inventory.json", inventory.render_json(sections)) + except Exception: # inventory probes vary by machine; never let it break storage + pass return target diff --git a/src/rigdoctor/core/syslogs.py b/src/rigdoctor/core/syslogs.py index f40999c..4299070 100644 --- a/src/rigdoctor/core/syslogs.py +++ b/src/rigdoctor/core/syslogs.py @@ -1,19 +1,29 @@ -"""Session-scoped system logs for diagnostics (M15): kernel log + crashed-process records. +"""Session-scoped system logs for diagnostics (M15): kernel, coredumps, NVIDIA, display. -Reads the kernel ring buffer slice (`journalctl -k`) and systemd-coredump records -(`coredumpctl`) covering the diagnostic window, so the report bundle and the AI both see what -the *system* logged when something went wrong — Xid, OOM-killer, MCE, PCIe AER, thermal, hung -tasks, and whether a process (the game/wine) actually dumped core (SIGSEGV/ABRT). Best-effort -and size-bounded: degrades silently if the tools are missing or access is denied. Stdlib only. +Covers what the *system* logged when something went wrong, so the report bundle and the AI both +see it: + * kernel ring-buffer slice (`journalctl -k`) — Xid, OOM-killer, MCE, PCIe AER, thermal, hung tasks + * systemd-coredump records (`coredumpctl`) — did the game/wine dump core (SIGSEGV/ABRT), when + * an `nvidia-smi -q` snapshot — driver, throttle/clock-event reasons, clocks, power, temps, PCIe, + ECC + retired pages (point-in-time at diagnostic time) + * the display-server log — `Xorg.0.log` on X11, or the compositor's user-journal slice on Wayland +Best-effort and size-bounded: degrades silently if a tool is missing or access is denied. Stdlib only. 
""" from __future__ import annotations +import os import shutil import subprocess import time +from pathlib import Path -_MAX = 8000 # cap each section so the prompt/report stays small +_MAX = 8000 # cap each log section so the prompt/report stays small +_NV_MAX = 10000 # nvidia-smi -q is structured + valuable; allow a bit more (head-truncated) + +# Compositors whose user-journal entries are the "Wayland log" (OR-matched by journalctl). +_COMPOSITORS = ("gnome-shell", "mutter", "kwin_wayland", "Xwayland", "sway", "gamescope") +_XORG_LOGS = ("~/.local/share/xorg/Xorg.0.log", "/var/log/Xorg.0.log") def _since_arg(since: float | None) -> str | None: @@ -54,12 +64,67 @@ def coredumps(since: float | None = None, max_bytes: int = _MAX) -> str: return out[-max_bytes:] +def nvidia_snapshot(max_bytes: int = _NV_MAX) -> str: + """Point-in-time `nvidia-smi -q` (head-truncated — driver/temps/clocks/ECC sit near the top).""" + if not shutil.which("nvidia-smi"): + return "" + out = _run(["nvidia-smi", "-q"]) + return out[:max_bytes] if out else "" + + +def _xorg_log() -> Path | None: + for cand in _XORG_LOGS: + path = Path(os.path.expanduser(cand)) + if path.exists(): + return path + return None + + +def _session_type() -> str: + declared = os.environ.get("XDG_SESSION_TYPE", "").lower() + if declared in ("x11", "wayland"): + return declared + if os.environ.get("WAYLAND_DISPLAY"): + return "wayland" + return "x11" if _xorg_log() else "unknown" + + +def _tail_file(path: Path, max_bytes: int) -> str: + try: + size = path.stat().st_size + with path.open("rb") as fh: + if size > max_bytes: + fh.seek(size - max_bytes) + return fh.read().decode("utf-8", "replace") + except OSError: + return "" + + +def display_log(since: float | None = None, max_bytes: int = _MAX) -> str: + """Xorg.0.log on X11, or the compositor's user-journal slice on Wayland ('' if none).""" + if _session_type() == "wayland": + if not shutil.which("journalctl"): + return "" + cmd = ["journalctl", "--user", 
"--no-pager"] + since_arg = _since_arg(since) + if since_arg: + cmd += ["--since", since_arg] + cmd += [f"_COMM={comp}" for comp in _COMPOSITORS] # OR-matched + out = _run(cmd) + if not out or out.strip().lower() == "-- no entries --": + return "" + return out[-max_bytes:] + log = _xorg_log() # X11: Xorg log isn't wall-clock-timestamped, so tail rather than scope + return _tail_file(log, max_bytes) if log else "" + + def available() -> bool: - return bool(shutil.which("journalctl") or shutil.which("coredumpctl")) + return bool(shutil.which("journalctl") or shutil.which("coredumpctl") + or shutil.which("nvidia-smi") or _xorg_log()) def collect(since: float | None = None) -> str: - """Kernel-log slice + crashed-process records as one labelled block ('' if none).""" + """Kernel + coredumps + NVIDIA snapshot + display log as one labelled block ('' if none).""" sections: list[str] = [] kern = kernel_log(since) if kern: @@ -67,4 +132,10 @@ def collect(since: float | None = None) -> str: cores = coredumps(since) if cores: sections.append(f"--- Crashed processes (coredumpctl) ---\n{cores}") + nvidia = nvidia_snapshot() + if nvidia: + sections.append(f"--- NVIDIA snapshot (nvidia-smi -q) ---\n{nvidia}") + display = display_log(since) + if display: + sections.append(f"--- Display server log ({_session_type()}) ---\n{display}") return "\n\n".join(sections) diff --git a/tests/test_diagstore.py b/tests/test_diagstore.py index 77097e8..baceb51 100644 --- a/tests/test_diagstore.py +++ b/tests/test_diagstore.py @@ -47,11 +47,15 @@ class StoreTests(unittest.TestCase): with mock.patch.object(diagstore, "enabled", return_value=True), \ mock.patch("rigdoctor.render.render_summary", return_value="SUMMARY-TEXT"), \ mock.patch("rigdoctor.core.gamelogs.collect", return_value="LOG-TEXT"), \ + mock.patch("rigdoctor.core.syslogs.collect", return_value="SYS-LOG"), \ + mock.patch("rigdoctor.core.inventory.collect", return_value=[]), \ mock.patch.object(diagstore.config, "DIAGNOSTICS_DIR", 
self.tmp / "diagnostics"): directory = diagstore.store(FakeResult()) self.assertTrue((directory / "result.json").exists()) self.assertTrue((directory / "report.txt").exists()) self.assertEqual((directory / "gamelogs.txt").read_text(), "LOG-TEXT") + self.assertEqual((directory / "syslogs.txt").read_text(), "SYS-LOG") + self.assertTrue((directory / "inventory.txt").exists()) # inventory included for debugging data = json.loads((directory / "result.json").read_text()) self.assertEqual(data["game"], "Path of Exile 2") self.assertEqual(len(data["findings"]), 1) diff --git a/tests/test_syslogs.py b/tests/test_syslogs.py index 4e71056..24ce0be 100644 --- a/tests/test_syslogs.py +++ b/tests/test_syslogs.py @@ -34,19 +34,60 @@ class CoredumpTests(unittest.TestCase): self.assertIn("PathOfExile", out) +class NvidiaTests(unittest.TestCase): + def test_missing_tool(self): + with mock.patch("shutil.which", return_value=None): + self.assertEqual(syslogs.nvidia_snapshot(), "") + + def test_snapshot_head_truncated(self): + with mock.patch("shutil.which", return_value="/usr/bin/nvidia-smi"), \ + mock.patch.object(syslogs, "_run", return_value="DRIVER\n" + "x" * 99999): + out = syslogs.nvidia_snapshot(max_bytes=10) + self.assertEqual(out, "DRIVER\nxxx") # head, not tail + + +class DisplayTests(unittest.TestCase): + def test_session_type_env(self): + with mock.patch.dict("os.environ", {"XDG_SESSION_TYPE": "wayland"}): + self.assertEqual(syslogs._session_type(), "wayland") + + def test_x11_tails_xorg_log(self): + import tempfile + from pathlib import Path + log = Path(tempfile.mkdtemp()) / "Xorg.0.log" + log.write_text("(EE) NVIDIA(GPU-0): something failed") + with mock.patch.object(syslogs, "_session_type", return_value="x11"), \ + mock.patch.object(syslogs, "_xorg_log", return_value=log): + out = syslogs.display_log() + self.assertIn("(EE) NVIDIA", out) + + def test_wayland_uses_user_journal(self): + with mock.patch.object(syslogs, "_session_type", return_value="wayland"), \ + 
mock.patch("shutil.which", return_value="/usr/bin/journalctl"), \ + mock.patch.object(syslogs, "_run", return_value="gnome-shell: GPU error") as run: + out = syslogs.display_log(since=1_000_000_000) + self.assertIn("GPU error", out) + cmd = run.call_args[0][0] + self.assertIn("--user", cmd) + self.assertTrue(any(a.startswith("_COMM=") for a in cmd)) + + class CollectTests(unittest.TestCase): def test_collect_combines_sections(self): with mock.patch.object(syslogs, "kernel_log", return_value="NVRM: Xid 79"), \ - mock.patch.object(syslogs, "coredumps", return_value="game SIGSEGV"): + mock.patch.object(syslogs, "coredumps", return_value="game SIGSEGV"), \ + mock.patch.object(syslogs, "nvidia_snapshot", return_value="Driver Version 595"), \ + mock.patch.object(syslogs, "display_log", return_value="(EE) NVIDIA"): out = syslogs.collect() - self.assertIn("Kernel log", out) - self.assertIn("Xid 79", out) - self.assertIn("Crashed processes", out) - self.assertIn("SIGSEGV", out) + for needle in ("Kernel log", "Xid 79", "Crashed processes", "SIGSEGV", + "NVIDIA snapshot", "595", "Display server log"): + self.assertIn(needle, out) def test_collect_empty_when_nothing(self): with mock.patch.object(syslogs, "kernel_log", return_value=""), \ - mock.patch.object(syslogs, "coredumps", return_value=""): + mock.patch.object(syslogs, "coredumps", return_value=""), \ + mock.patch.object(syslogs, "nvidia_snapshot", return_value=""), \ + mock.patch.object(syslogs, "display_log", return_value=""): self.assertEqual(syslogs.collect(), "")