diff --git a/CHANGELOG.md b/CHANGELOG.md index 5133865..9d3675f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,27 @@ All notable changes to RigDoctor are recorded here. Format follows (`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git release tag (so the auto-updater, D18, can compare versions). +## [0.0.5] - 2026-05-21 +### Added +- **M9 installer (first cut)**: detects distro / package manager / GPU; a catalog of optional + components (smartmontools, lm-sensors, dmidecode, pciutils, libnotify) with what each + enables; `rigdoctor install [--check] [-y]` installs missing apt packages via pkexec/sudo + with consent; GUI **Setup** tab with one-click install. Fixes the "smartmontools missing" + gap in the health report. +- **Update check (M13, check half)**: on GUI launch the sidebar checks the Gitea releases API + and shows "up-to-date", an "Update to v…" button if a newer release exists, or "update check + unavailable" if the API can't be reached anonymously. + +## [0.0.4] - 2026-05-21 +### Added +- **M4 health report**: scans kernel logs (NVIDIA Xid incl. 79 "fell off the bus", kernel + panic, OOM, MCE, PCIe AER, thermal, amdgpu reset), SMART health, NVIDIA driver/library + mismatch, journald persistence, and live temps β†’ prioritized plain-language findings with + suggested fixes (read-only, D9). +- CLI `rigdoctor report` (text + `--json`). +- GUI **Health** tab: runs checks in the background; findings shown as severity-colored cards. +- Tests for the journal scanner. + ## [0.0.3] - 2026-05-21 ### Added - Show the app version (`v`) in the GUI sidebar. diff --git a/README.md b/README.md index b4e3401..55ce613 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@ A **modular diagnostics, monitoring, and health-check toolkit for Linux gamers.** -> **Status:** 🟒 Phase 1 (MVP) in progress. 
The **sensor core (M1)** and **crash-capture -> logger (M3)** work — `snapshot`/`monitor` read NVIDIA GPU, CPU, memory, and NVMe live, and -> `record` captures a crash-safe log with a post-crash report. A desktop GUI (M10) is also -> up. Health report (M4) is next. See `docs/ROADMAP.md`. +> **Status:** 🟢 Phase 1 (MVP) complete. The **sensor core (M1)**, **crash-capture logger +> (M3)**, and **health report (M4)** all work — live `snapshot`/`monitor`, crash-safe `record` +> with a post-crash report, and `report` to scan logs/SMART/driver for likely causes. A +> desktop GUI (M10) ties them together (dashboard, recording, health). See `docs/ROADMAP.md`. ## Why this exists @@ -104,8 +104,8 @@ rigdoctor gui # or: rigdoctor-gui It opens a dark-themed window with sidebar navigation and a **live dashboard** over the same sensor core — circular gauges for the headline metrics plus collapsible per-subsystem cards (GPU/CPU/memory/storage) with temperature-colored values (icey-blue → green → red). -The **Logs** section is a full recording page (start/stop, live status, and the post-crash -report); Health / Inventory are placeholders until M4 / M5 land. +The **Logs** and **Health** sections are full pages (recording controls + post-crash report; +and the kernel-log / SMART / driver scan). **Inventory** is a placeholder until M5 lands. Without the GUI extra, `pip install -e .` gives just the stdlib-only CLI. 
diff --git a/docs/MODULES.md b/docs/MODULES.md index 223aad2..26869a7 100644 --- a/docs/MODULES.md +++ b/docs/MODULES.md @@ -10,16 +10,16 @@ Status: ⬜ not started Β· 🟦 designing Β· 🟨 in progress Β· βœ… done |----|--------|--------|----------|-----------|----------|--------| | M1 | Sensor core | Essential | none (nvidia-smi, sysfs) | all (NVIDIA first) | P0 | ⬜ | | M3 | Crash-capture logger | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | 🟨 | -| M4 | Health report (log scan) | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | ⬜ | +| M4 | Health report (log scan) | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | 🟨 | | M2 | Live monitor (TUI) | Monitoring | none (stdlib curses) | all | P1 | ⬜ | | M8 | Alerting | Monitoring | libnotify (opt) | all | P2 | ⬜ | | M5 | System inventory | Diagnostics | none (opt: lm-sensors, dmidecode) | all | P1 | ⬜ | | M6 | Gaming env checks | Diagnostics | none | all | P2 | ⬜ | | M10 | Desktop GUI | Desktop UI | **python3-pyside6** | all | P2 | 🟨 | | M11 | Tray / menu-bar applet | Desktop UI | **python3-pyside6** (+ AppIndicator on GNOME) | all | P2 | ⬜ | -| M9 | Installer | (meta) | none | all | P1 | ⬜ | +| M9 | Installer | (meta) | none | all | P1 | 🟨 | | M12 | Session sharing / remote assist | Sharing | none (Tier 3: tmate/sshx) | all | P3 | ⬜ | -| M13 | Auto-update | (core) | none (stdlib; user-local file swap) | all | P3 | ⬜ | +| M13 | Auto-update | (core) | none (stdlib; user-local file swap) | all | P3 | 🟨 | | ~~M7~~ | ~~Stress / repro~~ | β€” | β€” | β€” | β€” | ❌ dropped (D7) | ## Notes per module @@ -36,7 +36,9 @@ Status: ⬜ not started Β· 🟦 designing Β· 🟨 in progress Β· βœ… done - **M4 Health report** β€” turns scattered logs into a prioritized, plain-language findings list with **suggested** fixes (read-only, D9). Reuses M1 for a live snapshot. Also powers the **guided diagnostic session** (with M3): pick a game β†’ focused capture β†’ scan β†’ - findings (see SPEC Β§4). 
+ findings (see SPEC §4). *Implemented:* journalctl scan (Xid/panic/OOM/MCE/AER/thermal/amdgpu), + SMART, NVIDIA driver-mismatch, journald-persistence + live-temp checks; `rigdoctor report` + (text/JSON) + GUI Health tab. GPU-firmware verification deferred. - **M2 Live monitor** — depends on M1; the terminal "HWMonitor for Linux" face. Stdlib-only. - **M5 / M6 Diagnostics** — inventory export + gaming-env checks; M6 flags risky settings and suggests the fix command but does not apply it (D9). @@ -52,14 +54,21 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done action (the guided diagnostic session), plus Open dashboard / Start-Stop recording / Snapshot / Quit (D13). Optional; shares the Qt dependency with M10. - **M9 Installer** — interactive wizard layered on the `.deb` (D8); apt-first dependency - resolution; enables the logger service and trigger mode. + resolution; enables the logger service and trigger mode. *Implemented (first cut):* distro/ + package-manager/GPU detection (`core/sysenv`), an optional-component catalog (`core/catalog`), + and dependency install via pkexec/sudo — `rigdoctor install [--check] [-y]` + GUI Setup tab. + *Pending:* writing config/module selection and enabling the `systemd --user` service. - **M12 Session sharing / remote assist** (D16) — let a helper inspect a user's machine, in an escalating ladder: (1) **diagnostic bundle export** (inventory + recent log + report, one-way), (2) **live read-only view** over a user-chosen tunnel (Tailscale/cloudflared/SSH, no hosted relay), (3) **gated interactive terminal** wrapping tmate/sshx (read-only by default; read-write only on explicit consent — a deliberate exception to D9). Per-session consent, ephemeral revocable tokens, audit log. 
-- **M13 Auto-update** (D18) β€” *planned.* On launch, check the public Gitea releases API and +- **M13 Auto-update** (D18) β€” *check half implemented:* on GUI launch, `core/updates` queries + the Gitea releases API and the sidebar shows up-to-date / an "Update to v…" button / "update + check unavailable" (the instance currently requires sign-in for anonymous API calls). The + no-root **self-update** (download β†’ verify β†’ atomic swap β†’ restart) is still pending. + *Original plan:* On launch, check the public Gitea releases API and **self-update a user-local install with no root** (download β†’ verify checksum/signature β†’ atomic symlink swap β†’ restart, incl. the daemon). HTTPS-only, version-check-only (no telemetry), opt-out-able. Surfaced in the GUI; `rigdoctor update` in the CLI. (`.deb` users diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 8b44d38..2ec5ae0 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -15,8 +15,8 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`). - [x] M3 crash-capture logger (JSONL, fsync per sample, GPU-lost detection, size rotation) - [x] Manual trigger mode (`rigdoctor record run/start/stop/status`); `systemd --user` service + other trigger modes in Phase 4 (`run` is already the service entrypoint) -- [ ] M4 health report (Xid/panic/OOM/MCE/AER/thermal scan + driver-mismatch + snapshot, - suggested fixes only β€” D9) +- [x] M4 health report (Xid/panic/OOM/MCE/AER/thermal scan + SMART + driver-mismatch + + journald-persistence + live temps, suggested fixes only β€” D9; GPU-firmware verify deferred) - [x] `record report` post-crash summary (peak temps/power per subsystem, events, last N samples) - **Exit criteria:** user can run it during gaming and, after a freeze/black-screen, see the last readings + a plausible cause. @@ -39,15 +39,17 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`). 
- [ ] Logger trigger modes: always-on + game-launch (D12 β€” wrapper first: `rigdoctor wrap %command%` + global Steam compat-tool; zero-config watcher (Steam RunningAppID + /proc) and GameMode hook follow) -- [ ] M9 interactive installer (GPU detection, module menu, apt dependency resolution, - service enable + trigger-mode pick) +- [~] M9 interactive installer β€” *done:* distro/GPU detection + optional-dependency install + (`rigdoctor install`, GUI Setup tab). *Pending:* module-selection config + `systemd --user` + service enable + trigger-mode pick. - [ ] `.deb` packaging (D8) declaring per-bundle deps incl. python3-pyside6 for Desktop UI ## Phase 5 β€” Breadth (later) - [ ] AMD GPU support in M1 (Steam Deck / Radeon) - [ ] Intel GPU best-effort -- [ ] M13 auto-update (D18) β€” launch-time version check + no-root self-update of the - user-local install from the public Gitea releases; GUI prompt + `rigdoctor update` +- [~] M13 auto-update (D18) β€” *done:* launch-time version check shown in the GUI sidebar + (up-to-date / "Update to v…" / unavailable). *Pending:* no-root self-update of the + user-local install from the public Gitea releases; `rigdoctor update`. - [ ] (Later, separate milestone) Optional auto-apply of suggested fixes behind explicit consent β€” currently out of scope (D9) diff --git a/pyproject.toml b/pyproject.toml index b7988fb..05d9cfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "rigdoctor" -version = "0.0.3" +version = "0.0.5" description = "Modular hardware monitoring & crash diagnostics for Linux gamers." 
def cmd_install(args) -> int:
    """Show optional-component status and, with consent, install what is missing (M9).

    Prints the detected distro, package manager and GPU vendors, followed by
    one line per catalog component. Unless ``args.check`` is set, the missing
    apt packages are installed after an interactive confirmation (skipped
    when ``args.yes`` is set).

    Returns:
        0 when nothing is missing, when only a status check was requested,
        or when the install command succeeded; 1 when apt is unavailable or
        the user declined; otherwise the exit code of the install command.
    """
    from .core import installer, sysenv

    print(f"Distro: {sysenv.distro_name()}")
    pm = sysenv.package_manager()
    print(f"Package manager: {pm or 'none (only apt is supported)'}")
    print(f"GPU: {', '.join(sysenv.gpu_vendors()) or 'unknown'}\n")

    status = installer.component_status()
    print("Optional components:")
    for component, present in status:
        mark = "✓" if present else "✗"
        print(f"  [{mark}] {component.name:<22} — {component.enables}")
        if not present:
            print(f"        apt: {' '.join(component.apt)}")

    missing = [component for component, present in status if not present]
    if not missing:
        print("\nAll optional components are installed. ✔")
        return 0

    packages = installer.missing_packages(missing)
    print(f"\nMissing packages: {' '.join(packages)}")
    if args.check:
        return 0

    if pm != "apt":
        # Without apt an `apt install` hint is useless; list the (Debian)
        # package names and leave the tool choice to the user.
        print(
            "Automatic install needs apt. Install these packages (Debian names) "
            f"with your distribution's package manager:\n  {' '.join(packages)}"
        )
        return 1

    if not args.yes:
        try:
            reply = input(f"\nInstall {len(packages)} package(s) now? [y/N] ").strip().lower()
        except EOFError:
            # Non-interactive stdin counts as "no": never install without consent.
            reply = "n"
        if reply not in ("y", "yes"):
            print("Aborted.")
            return 1

    print("Installing (you may be prompted for your password)…")
    rc, out = installer.install_packages(packages)
    print(out[-2000:])  # only the tail of the apt output is shown
    if rc == 0:
        # Re-probe so the user sees what is actually usable after the install.
        still = [c.name for c, present in installer.component_status() if not present]
        print("\nStill missing: " + (", ".join(still) if still else "none ✔"))
    else:
        print(f"\nInstall failed (exit {rc}).")
    return rc


def cmd_report(args) -> int:
    """Run the M4 health checks and print them as text or, with ``--json``, as JSON.

    Always returns 0: findings are reported, not turned into an exit status.
    """
    from dataclasses import asdict

    from .core.health import run_health_checks
    from .render import render_health

    findings = run_health_checks()
    if args.json:
        print(json.dumps([asdict(f) for f in findings], indent=2, ensure_ascii=False))
    else:
        print(render_health(findings))
    return 0
@dataclass(frozen=True)
class Component:
    """One optional system tool the installer knows how to detect and install."""

    id: str
    name: str
    bundle: str
    enables: str  # capability unlocked when the tool is present
    apt: tuple[str, ...]  # apt package name(s) that provide it
    command: str  # executable whose presence marks the component as installed


# Catalog order is the order shown to the user.
COMPONENTS: tuple[Component, ...] = (
    Component(
        id="smartmontools",
        name="SMART disk health",
        bundle="Diagnostics",
        enables="Disk health (SMART) in the health report (M4)",
        apt=("smartmontools",),
        command="smartctl",
    ),
    Component(
        id="lm-sensors",
        name="lm-sensors",
        bundle="Diagnostics",
        enables="Extra motherboard / voltage sensors",
        apt=("lm-sensors",),
        command="sensors",
    ),
    Component(
        id="dmidecode",
        name="dmidecode",
        bundle="Diagnostics",
        enables="Motherboard / BIOS / RAM details for system inventory (M5)",
        apt=("dmidecode",),
        command="dmidecode",
    ),
    Component(
        id="pciutils",
        name="pciutils",
        bundle="Diagnostics",
        enables="PCIe topology + GPU detection (lspci)",
        apt=("pciutils",),
        command="lspci",
    ),
    Component(
        id="libnotify",
        name="Desktop notifications",
        bundle="Monitoring",
        enables="Desktop alert notifications (M8)",
        apt=("libnotify-bin",),
        command="notify-send",
    ),
)
CRITICAL = "critical"
WARNING = "warning"
INFO = "info"
OK = "ok"
# Sort rank per severity: lower sorts first (worst first).
_ORDER = {CRITICAL: 0, WARNING: 1, INFO: 2, OK: 3}


@dataclass
class Finding:
    """One line of the health report."""

    severity: str  # critical | warning | info | ok
    category: str  # GPU, Kernel, Memory, Storage, Thermal, Driver, PCIe, Logs
    title: str
    detail: str = ""
    suggestion: str = ""


# --- NVIDIA Xid knowledge (the seed crash is Xid 79) --------------------------
_XID_INFO: dict[int, tuple[str, str]] = {
    13: (WARNING, "Graphics engine exception (often an app/driver bug or unstable overclock)"),
    31: (WARNING, "GPU memory page fault (usually a driver or application bug)"),
    43: (WARNING, "GPU stopped processing a task (application error)"),
    45: (INFO, "Preemptive channel removal (often a side-effect of another error or a reboot)"),
    48: (CRITICAL, "Double-bit ECC error — VRAM hardware fault"),
    62: (CRITICAL, "Internal microcontroller halt (often follows instability)"),
    79: (CRITICAL, "GPU has fallen off the bus — hardware: power delivery, PCIe link, or thermals"),
    94: (CRITICAL, "Contained ECC error"),
    95: (CRITICAL, "Uncontained ECC error"),
    119: (CRITICAL, "GSP RPC timeout — GPU System Processor hang"),
    120: (CRITICAL, "GSP error — GPU System Processor fault"),
}
_XID_SUGGEST: dict[int, str] = {
    79: "Check PSU/power cables and reseat the GPU/riser; test a lower power limit "
        "(`sudo nvidia-smi -pl <watts>`) and capture a session with `rigdoctor record`.",
    48: "Persistent VRAM ECC errors mean failing memory — RMA the card if it recurs.",
    119: "GSP hangs are often driver-version specific — try a different driver branch.",
    120: "GSP errors are often driver-version specific — try a different driver branch.",
}
# Matches "Xid 79", "Xid: 79" and "Xid (PCI:0000:01:00): 79"; group 1 is the code.
_XID_RE = re.compile(r"Xid(?:\s*\([^)]*\))?:?\s*(\d+)")

# Tried in order; the first marker that occurs is the one counted, so a single
# kill that logs several OOM lines is reported once.
_OOM_MARKERS = ("out of memory", "oom-kill:", "oom-killer", "oom_reaper")


def _count_oom_kills(lowered: list[str]) -> int:
    """Count OOM events in lower-cased log lines using one marker only."""
    for marker in _OOM_MARKERS:
        hits = sum(1 for ln in lowered if marker in ln)
        if hits:
            return hits
    return 0


def _is_mce_line(ln: str) -> bool:
    """Tell whether a lower-cased line reports a machine-check / hardware error.

    A bare "mce:" prefix is not enough: the kernel logs informational
    "mce: ..." lines that carry no error.
    """
    return "machine check" in ln or "hardware error" in ln or ("mce:" in ln and "error" in ln)


def _is_aer_line(ln: str) -> bool:
    """Tell whether a lower-cased line reports a PCIe/AER *error*.

    "AER: enabled with IRQ ..." style lines mention AER without any error,
    so the AER/pcieport tags only count together with the word "error".
    """
    return "pcie bus error" in ln or (("aer:" in ln or "pcieport" in ln) and "error" in ln)


def scan_journal_text(text: str) -> list[Finding]:
    """Parse kernel-log text into findings (separated from IO so it's testable).

    Args:
        text: Raw kernel log, one message per line.

    Returns:
        Findings in a fixed order: Xid codes (ascending), OOM, kernel panic,
        MCE, PCIe AER, thermal, amdgpu reset. Empty when nothing matched.
    """
    lines = text.splitlines()
    low = [ln.lower() for ln in lines]
    findings: list[Finding] = []

    xids: dict[int, int] = {}
    for line in lines:
        if "Xid" in line:
            m = _XID_RE.search(line)
            if m:
                code = int(m.group(1))
                xids[code] = xids.get(code, 0) + 1
    for code in sorted(xids):
        severity, desc = _XID_INFO.get(code, (WARNING, f"NVIDIA GPU error (Xid {code})"))
        suggest = _XID_SUGGEST.get(code, "Look up this Xid code in NVIDIA's Xid error documentation.")
        findings.append(Finding(severity, "GPU", f"NVIDIA Xid {code} ×{xids[code]}", desc, suggest))

    oom = _count_oom_kills(low)
    if oom:
        findings.append(Finding(
            WARNING, "Memory", f"Out-of-memory kills ×{oom}",
            "The kernel killed processes to reclaim RAM.",
            "Close memory-heavy apps, add zram/swap, or investigate a leak.",
        ))

    if any("Kernel panic" in ln for ln in lines):
        findings.append(Finding(
            CRITICAL, "Kernel", "Kernel panic recorded",
            "The kernel hit an unrecoverable error.",
            "Note the panic message; review recent driver/kernel updates and hardware.",
        ))

    if any(_is_mce_line(ln) for ln in low):
        findings.append(Finding(
            CRITICAL, "Hardware", "Machine Check Exception (MCE)",
            "The CPU reported a hardware error.",
            "Run memtest86 for RAM, check CPU temps/voltages, and review the MCE detail.",
        ))

    if any(_is_aer_line(ln) for ln in low):
        findings.append(Finding(
            WARNING, "PCIe", "PCIe bus errors (AER)",
            "Correctable/uncorrectable PCIe errors were logged.",
            "Reseat the device and check risers/cabling; AER storms can precede a GPU drop.",
        ))

    if any(
        ("thermal" in ln and ("critical" in ln or "throttl" in ln))
        or "temperature above threshold" in ln
        for ln in low
    ):
        findings.append(Finding(
            WARNING, "Thermal", "Thermal events logged",
            "The system logged thermal throttling / critical-temperature events.",
            "Improve airflow/cooling and check fan curves; watch live temps on the dashboard.",
        ))

    if any("amdgpu" in ln and "reset" in ln for ln in low):
        findings.append(Finding(
            CRITICAL, "GPU", "AMD GPU reset (amdgpu)",
            "The AMD GPU was reset after a hang.",
            "Check power/thermals/driver; capture a session with `rigdoctor record`.",
        ))

    return findings


def _journalctl(args: list[str]) -> str | None:
    """Run ``journalctl`` with *args* and return its stdout.

    Returns None when journalctl is missing, cannot be run, times out,
    fails without producing output, or says the system journal is hidden
    from this user. In each case an empty result must not be read as
    "no errors".
    """
    if shutil.which("journalctl") is None:
        return None
    try:
        proc = subprocess.run(["journalctl", *args], capture_output=True, text=True, timeout=25)
    except (subprocess.SubprocessError, OSError):
        return None
    if proc.returncode != 0 and not proc.stdout.strip():
        return None
    # NOTE(review): hint wording taken from journalctl's unprivileged-user
    # notice; confirm against the systemd versions being targeted.
    if "not seeing messages from other users" in proc.stderr:
        return None
    return proc.stdout


def check_journal() -> list[Finding]:
    """Scan the last 7 days of kernel messages; never returns an empty list."""
    out = _journalctl(["-k", "--no-pager", "-o", "cat", "--since", "-7 days"])
    if out is None:
        return [Finding(
            INFO, "Logs", "Couldn't read the kernel journal",
            "journalctl is unavailable or not readable.",
            "Ensure systemd/journald is present and your user is in the 'systemd-journal' or 'adm' group.",
        )]
    findings = scan_journal_text(out)
    if not findings:
        findings.append(Finding(
            OK, "Logs", "No notable kernel errors (last 7 days)",
            "No Xid, panic, OOM, MCE, PCIe AER, or thermal events found.",
        ))
    return findings


def check_journal_persistence() -> list[Finding]:
    """Warn when ``/var/log/journal`` does not exist as a directory."""
    if Path("/var/log/journal").is_dir():
        return []
    return [Finding(
        WARNING, "Logs", "journald isn't persistent across reboots",
        "Crash-boot kernel logs are discarded on reboot, so a hard freeze's evidence can vanish.",
        "Enable persistent logging: `sudo mkdir -p /var/log/journal && sudo systemctl restart systemd-journald`",
    )]


def check_nvidia_driver() -> list[Finding]:
    """Report an NVIDIA kernel-module / userspace-library version mismatch.

    Returns an empty list when ``nvidia-smi`` is absent, cannot be run, or
    does not print the mismatch message.
    """
    if shutil.which("nvidia-smi") is None:
        return []
    try:
        proc = subprocess.run(["nvidia-smi"], capture_output=True, text=True, timeout=10)
    except (subprocess.SubprocessError, OSError):
        return []
    if "Driver/library version mismatch" in (proc.stdout + proc.stderr):
        return [Finding(
            CRITICAL, "Driver", "NVIDIA driver/library version mismatch",
            "The loaded kernel module and the userspace NVIDIA libraries differ — GPU monitoring will fail until resolved.",
            "Reboot to load the matching module (or finish the interrupted driver update).",
        )]
    return []


def _smart_devices() -> list[str]:
    """Return the ``/dev/...`` paths listed by ``smartctl --scan`` (empty on failure)."""
    try:
        proc = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10)
    except (subprocess.SubprocessError, OSError):
        return []
    # Each scan line starts with the device path followed by other fields.
    return [
        line.split()[0]
        for line in (raw.strip() for raw in proc.stdout.splitlines())
        if line.startswith("/dev/")
    ]


def check_smart() -> list[Finding]:
    """Read the overall SMART health verdict of every drive smartctl can see."""
    if shutil.which("smartctl") is None:
        return [Finding(
            INFO, "Storage", "SMART not checked (smartmontools missing)",
            "Disk self-health couldn't be read.",
            "Install it for disk health checks: `sudo apt install smartmontools`",
        )]
    devices = _smart_devices()
    if not devices:
        return [Finding(
            INFO, "Storage", "SMART: couldn't enumerate drives",
            "Reading SMART usually needs root.",
            "Run: `sudo rigdoctor report`",
        )]
    findings: list[Finding] = []
    for dev in devices:
        try:
            proc = subprocess.run(["smartctl", "-H", dev], capture_output=True, text=True, timeout=15)
        except (subprocess.SubprocessError, OSError):
            continue
        combined = proc.stdout + proc.stderr
        if "Permission denied" in combined or "requires root" in combined.lower():
            findings.append(Finding(INFO, "Storage", f"SMART for {dev} needs root", "", "Run: `sudo rigdoctor report`"))
        # "PASSED" is tested before "FAILED" on purpose: the attribute table
        # header "WHEN_FAILED" can appear in the output of a passing drive.
        # SCSI/SAS drives report "SMART Health Status: OK" instead of PASSED.
        elif "PASSED" in combined or "SMART Health Status: OK" in combined:
            findings.append(Finding(OK, "Storage", f"SMART OK: {dev}", "Overall-health self-assessment passed."))
        elif "FAILED" in combined or "FAILING_NOW" in combined:
            findings.append(Finding(
                CRITICAL, "Storage", f"SMART FAILED: {dev}",
                "The drive reports failing health.", "Back up now and replace the drive.",
            ))
    return findings


def check_live_temps() -> list[Finding]:
    """Warn when any live temperature reading is at or above 90 °C."""
    from .sampler import Sampler
    from .sources import available_sources

    sample = Sampler(available_sources()).sample()
    hot = [
        (r.source, r.label or r.metric, r.value)
        for r in sample.readings
        if r.unit == "°C" and r.value is not None and r.value >= 90
    ]
    if not hot:
        return []
    worst = max(hot, key=lambda x: x[2])
    detail = "; ".join(f"{s} {label} {v:.0f}°C" for s, label, v in hot)
    return [Finding(
        WARNING, "Thermal", f"High temperature right now ({worst[2]:.0f}°C)",
        detail, "Check cooling/airflow and reduce load.",
    )]


def run_health_checks() -> list[Finding]:
    """Run all checks and return findings sorted by severity (worst first)."""
    findings: list[Finding] = []
    findings += check_nvidia_driver()
    findings += check_journal()
    findings += check_journal_persistence()
    findings += check_smart()
    findings += check_live_temps()
    # list.sort is stable, so findings of equal severity keep check order.
    findings.sort(key=lambda f: _ORDER.get(f.severity, 9))
    return findings
def component_status(present: Callable[[str], bool] | None = None) -> list[tuple[Component, bool]]:
    """Pair each catalog component with whether it's installed (command present).

    Args:
        present: Optional probe taking a command name and returning whether
            it is available; defaults to ``sysenv.has_command``.
    """
    probe = present or sysenv.has_command
    return [(component, probe(component.command)) for component in COMPONENTS]


def missing_packages(components: list[Component]) -> list[str]:
    """De-duplicated apt package list for the given components, order preserved."""
    # dict keys keep first-insertion order, which gives an ordered de-dup in
    # one pass instead of a list membership scan per package.
    return list(dict.fromkeys(pkg for component in components for pkg in component.apt))


def apt_install_command(packages: list[str]) -> list[str]:
    """Build an `apt-get update && install` command, elevated if we're not root.

    The install step sets ``DEBIAN_FRONTEND=noninteractive``: the caller
    captures the command's output, so a debconf question would otherwise
    wait on a prompt the user never sees. The variable is set inside the
    shell snippet, so it does not depend on pkexec/sudo passing the
    environment through.

    Returns:
        The argv list. It is prefixed with ``pkexec`` or ``sudo`` when not
        running as root and one of them is on PATH (pkexec preferred).
    """
    quoted = " ".join(shlex.quote(p) for p in packages)
    inner = f"apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y {quoted}"
    cmd = ["/bin/sh", "-c", inner]
    if os.geteuid() == 0:
        return cmd
    for elevator in ("pkexec", "sudo"):
        if shutil.which(elevator):
            return [elevator, *cmd]
    return cmd  # no privilege escalation available — will likely fail, surfaced to the caller
Returns (exit_code, combined_output).""" + if not packages: + return (0, "Nothing to install.") + try: + proc = subprocess.run( + apt_install_command(packages), capture_output=True, text=True, timeout=900 + ) + return (proc.returncode, proc.stdout + proc.stderr) + except (subprocess.SubprocessError, OSError) as exc: + return (1, str(exc)) diff --git a/src/rigdoctor/core/sysenv.py b/src/rigdoctor/core/sysenv.py new file mode 100644 index 0000000..222d6e7 --- /dev/null +++ b/src/rigdoctor/core/sysenv.py @@ -0,0 +1,49 @@ +"""Environment detection for the installer (M9).""" + +from __future__ import annotations + +import shutil +import subprocess + + +def package_manager() -> str | None: + """Only apt is supported (D15); return 'apt' if present, else None.""" + if shutil.which("apt-get") or shutil.which("apt"): + return "apt" + return None + + +def has_command(cmd: str) -> bool: + return shutil.which(cmd) is not None + + +def distro_name() -> str: + try: + data: dict[str, str] = {} + with open("/etc/os-release") as f: + for line in f: + key, _, value = line.partition("=") + data[key.strip()] = value.strip().strip('"') + return data.get("PRETTY_NAME") or data.get("NAME") or "Linux" + except OSError: + return "Linux" + + +def gpu_vendors() -> list[str]: + vendors: list[str] = [] + if shutil.which("nvidia-smi"): + vendors.append("NVIDIA") + out = "" + if shutil.which("lspci"): + try: + out = subprocess.run(["lspci"], capture_output=True, text=True, timeout=10).stdout + except (subprocess.SubprocessError, OSError): + out = "" + low = out.lower() + if "nvidia" in low and "NVIDIA" not in vendors: + vendors.append("NVIDIA") + if ("amd/ati" in low or "advanced micro devices" in low or "radeon" in low) and "AMD" not in vendors: + vendors.append("AMD") + if "intel" in low and any(k in low for k in ("vga", "display", "graphics")) and "Intel" not in vendors: + vendors.append("Intel") + return vendors diff --git a/src/rigdoctor/core/updates.py b/src/rigdoctor/core/updates.py new 
GITEA_BASE = "https://git.jesseyvanofferen.com"
REPO = "jessey/rigdoctor"
LATEST_API = f"{GITEA_BASE}/api/v1/repos/{REPO}/releases/latest"
RELEASES_PAGE = f"{GITEA_BASE}/{REPO}/releases"


def _parse(version: str) -> tuple[int, ...]:
    """Turn a tag such as ``v0.0.5`` or ``0.1.0-rc1`` into its numeric release tuple.

    Parsing stops at the first component that is not purely numeric. The
    leading digits of that component are kept, so ``0.0.5-rc1`` gives
    ``(0, 0, 5)``; pre-release suffixes are not ordered. Unparseable input
    gives ``()``.
    """
    numbers: list[int] = []
    for piece in version.strip().lstrip("vV").split("."):
        lead = piece[: len(piece) - len(piece.lstrip("0123456789"))]
        if not lead:
            break
        numbers.append(int(lead))
        if lead != piece:  # trailing "-rc1", "+build", ... ends the numeric part
            break
    return tuple(numbers)


def is_newer(latest: str, current: str = __version__) -> bool:
    """Return True when *latest* is a strictly higher release than *current*.

    Missing trailing components count as zero, so ``1.0`` and ``1.0.0``
    are equal. A *latest* that is unparseable or not a string is never
    newer.
    """
    try:
        new, cur = _parse(latest), _parse(current)
    except (ValueError, AttributeError):
        return False
    if not new:
        return False
    # Pad to equal length: plain tuple comparison would rank (1, 0, 0)
    # above (1, 0).
    width = max(len(new), len(cur))
    new += (0,) * (width - len(new))
    cur += (0,) * (width - len(cur))
    return new > cur
# severity -> (badge text, badge colour)
_SEV = {
    "critical": ("CRITICAL", CRIT),
    "warning": ("WARNING", WARN),
    "info": ("INFO", MUTED),
    "ok": ("OK", GOOD),
}


def _finding_widget(finding) -> QFrame:
    """Build the card for one finding: coloured header, then optional detail and suggestion."""
    label, color = _SEV.get(finding.severity, ("?", MUTED))
    card = QFrame()
    card.setObjectName("Card")
    v = QVBoxLayout(card)
    v.setContentsMargins(16, 12, 16, 12)
    v.setSpacing(4)

    head = QLabel(f"{label} · {finding.category}: {finding.title}")
    head.setStyleSheet(f"color: {color}; font-weight: 700; background: transparent;")
    head.setWordWrap(True)
    v.addWidget(head)

    if finding.detail:
        detail = QLabel(finding.detail)
        detail.setObjectName("Muted")
        detail.setWordWrap(True)
        v.addWidget(detail)
    if finding.suggestion:
        suggestion = QLabel(f"→ {finding.suggestion}")
        suggestion.setStyleSheet(f"color: {ACCENT}; background: transparent;")
        suggestion.setWordWrap(True)
        v.addWidget(suggestion)
    return card


class HealthPage(QWidget):
    """Health tab: runs the health checks on a worker thread and lists the findings."""

    # Carries the list of findings from the worker thread to _render_findings.
    _result = Signal(object)  # list[Finding]

    def __init__(self) -> None:
        super().__init__()
        self.setObjectName("Page")
        self._result.connect(self._render_findings)

        root = QVBoxLayout(self)
        root.setContentsMargins(20, 18, 20, 18)
        root.setSpacing(16)

        header = QHBoxLayout()
        title = QLabel("Health")
        title.setObjectName("PageTitle")
        header.addWidget(title)
        header.addStretch(1)
        self._status = QLabel("")
        self._status.setObjectName("Muted")
        header.addWidget(self._status)
        self._run_btn = QPushButton("Run health report")
        self._run_btn.setObjectName("PrimaryButton")
        self._run_btn.clicked.connect(self._run)
        header.addWidget(self._run_btn)
        root.addLayout(header)

        scroll = QScrollArea()
        scroll.setWidgetResizable(True)
        scroll.setFrameShape(QFrame.Shape.NoFrame)
        scroll.setStyleSheet("background: transparent;")
        self._container = QWidget()
        self._list = QVBoxLayout(self._container)
        self._list.setContentsMargins(0, 0, 0, 0)
        self._list.setSpacing(10)
        self._list.setAlignment(Qt.AlignmentFlag.AlignTop)
        scroll.setWidget(self._container)
        root.addWidget(scroll, 1)

        QTimer.singleShot(300, self._run)  # auto-run shortly after the window opens

    def _run(self) -> None:
        """Disable the button and start the checks on a daemon thread."""
        self._run_btn.setEnabled(False)
        self._status.setText("Scanning logs, SMART, and driver…")
        threading.Thread(target=self._work, daemon=True).start()

    def _work(self) -> None:
        """Worker-thread body: run the checks and hand the result over via the signal."""
        from ..core.health import Finding, run_health_checks

        try:
            findings = run_health_checks()
        except Exception as exc:
            # An empty list would render as "0 critical · 0 warning", which
            # reads as healthy. Report the failure as a finding instead.
            findings = [Finding(
                "warning", "Health", "Health checks failed to run",
                f"{type(exc).__name__}: {exc}",
                "Run `rigdoctor report` in a terminal to see the full error.",
            )]
        self._result.emit(findings)

    def _render_findings(self, findings) -> None:
        """Replace the card list with *findings* and refresh the status line."""
        # Drop the previous cards; the trailing stretch item has no widget.
        while self._list.count():
            item = self._list.takeAt(0)
            w = item.widget()
            if w is not None:
                w.deleteLater()

        crit = sum(1 for f in findings if f.severity == "critical")
        warn = sum(1 for f in findings if f.severity == "warning")
        self._status.setText(
            f"{crit} critical · {warn} warning · {len(findings)} checks · "
            f"{time.strftime('%H:%M:%S')}"
        )
        for finding in findings:
            self._list.addWidget(_finding_widget(finding))
        self._list.addStretch(1)
        self._run_btn.setEnabled(True)
--git a/src/rigdoctor/gui/main_window.py b/src/rigdoctor/gui/main_window.py index 938a771..cb58be0 100644 --- a/src/rigdoctor/gui/main_window.py +++ b/src/rigdoctor/gui/main_window.py @@ -2,7 +2,10 @@ from __future__ import annotations -from PySide6.QtCore import Qt +import threading + +from PySide6.QtCore import Qt, QUrl, Signal +from PySide6.QtGui import QDesktopServices from PySide6.QtWidgets import ( QButtonGroup, QFrame, @@ -16,19 +19,23 @@ from PySide6.QtWidgets import ( ) from .. import __version__ +from ..core import updates from .dashboard import Dashboard +from .health_page import HealthPage from .recorder_page import RecorderPage -from .theme import ACCENT, MUTED +from .setup_page import SetupPage +from .theme import ACCENT, GOOD, MUTED from .worker import SamplerWorker -_NAV_ITEMS = ["Dashboard", "Logs", "Health", "Inventory"] +_NAV_ITEMS = ["Dashboard", "Logs", "Health", "Setup", "Inventory"] _PLACEHOLDERS = { - "Health": "The health report (M4) β€” log scan + plain-language findings β€” lands here.", "Inventory": "System inventory (M5) β€” CPU/GPU/board/RAM/drivers β€” lands here.", } class MainWindow(QMainWindow): + _update_checked = Signal(object) # latest tag (str) or None + def __init__(self, interval: float = 1.0) -> None: super().__init__() self.setWindowTitle("RigDoctor") @@ -48,10 +55,13 @@ class MainWindow(QMainWindow): self._stack = QStackedWidget() self.dashboard = Dashboard() self.recorder_page = RecorderPage() + self.health_page = HealthPage() + self.setup_page = SetupPage() self._stack.addWidget(self.dashboard) # 0 Dashboard self._stack.addWidget(self.recorder_page) # 1 Logs - self._stack.addWidget(self._placeholder_page("Health", _PLACEHOLDERS["Health"])) # 2 - self._stack.addWidget(self._placeholder_page("Inventory", _PLACEHOLDERS["Inventory"])) # 3 + self._stack.addWidget(self.health_page) # 2 Health + self._stack.addWidget(self.setup_page) # 3 Setup + self._stack.addWidget(self._placeholder_page("Inventory", 
_PLACEHOLDERS["Inventory"])) # 4 content_layout.addWidget(self._stack) layout.addWidget(self._build_sidebar()) @@ -61,6 +71,10 @@ class MainWindow(QMainWindow): self._worker.sampled.connect(self.dashboard.update_sample) self._worker.start() + # Background update check (M13); result lands in the sidebar. + self._update_checked.connect(self._show_update_state) + threading.Thread(target=self._check_updates, daemon=True).start() + def _build_sidebar(self) -> QFrame: bar = QFrame() bar.setObjectName("Sidebar") @@ -95,8 +109,33 @@ class MainWindow(QMainWindow): version = QLabel(f"v{__version__}") version.setObjectName("Muted") v.addWidget(version) + + # Update state (filled in by the background check). + self._update_label = QLabel("checking for updates…") + self._update_label.setObjectName("Muted") + v.addWidget(self._update_label) + self._update_btn = QPushButton() + self._update_btn.setObjectName("PrimaryButton") + self._update_btn.setCursor(Qt.CursorShape.PointingHandCursor) + self._update_btn.clicked.connect(lambda: QDesktopServices.openUrl(QUrl(updates.RELEASES_PAGE))) + self._update_btn.setVisible(False) + v.addWidget(self._update_btn) return bar + def _check_updates(self) -> None: + self._update_checked.emit(updates.check_latest()) + + def _show_update_state(self, latest) -> None: + if not latest: + self._update_label.setText("update check unavailable") + return + if updates.is_newer(latest, __version__): + self._update_label.setText(f'{latest} available') + self._update_btn.setText(f"Update to {latest}") + self._update_btn.setVisible(True) + else: + self._update_label.setText("up-to-date") + def _placeholder_page(self, title: str, description: str) -> QWidget: page = QWidget() page.setObjectName("Page") diff --git a/src/rigdoctor/gui/setup_page.py b/src/rigdoctor/gui/setup_page.py new file mode 100644 index 0000000..b4ef9ad --- /dev/null +++ b/src/rigdoctor/gui/setup_page.py @@ -0,0 +1,128 @@ +"""Setup page (M9 in the GUI): show environment + optional 
components, install missing.""" + +from __future__ import annotations + +import threading + +from PySide6.QtCore import Qt, Signal +from PySide6.QtWidgets import ( + QFrame, + QHBoxLayout, + QLabel, + QPushButton, + QSizePolicy, + QTextEdit, + QVBoxLayout, + QWidget, +) + +from ..core import installer, sysenv +from .theme import GOOD, MUTED + + +def _panel(title: str) -> tuple[QFrame, QVBoxLayout]: + frame = QFrame() + frame.setObjectName("Card") + frame.setSizePolicy(QSizePolicy.Policy.Expanding, QSizePolicy.Policy.Maximum) + layout = QVBoxLayout(frame) + layout.setContentsMargins(16, 14, 16, 14) + layout.setSpacing(8) + label = QLabel(title) + label.setStyleSheet("font-weight: 700; background: transparent;") + layout.addWidget(label) + return frame, layout + + +class SetupPage(QWidget): + _installed = Signal(int, str) + + def __init__(self) -> None: + super().__init__() + self.setObjectName("Page") + self._installed.connect(self._on_installed) + + root = QVBoxLayout(self) + root.setContentsMargins(20, 18, 20, 18) + root.setSpacing(16) + + title = QLabel("Setup") + title.setObjectName("PageTitle") + root.addWidget(title) + + env_card, env_layout = _panel("Environment") + self._env = QLabel("") + self._env.setObjectName("Muted") + env_layout.addWidget(self._env) + root.addWidget(env_card) + + comp_card, comp_layout = _panel("Optional components") + self._components = QVBoxLayout() + self._components.setSpacing(6) + comp_layout.addLayout(self._components) + controls = QHBoxLayout() + self._install_btn = QPushButton("Install missing") + self._install_btn.setObjectName("PrimaryButton") + self._install_btn.clicked.connect(self._install) + self._refresh_btn = QPushButton("Re-check") + self._refresh_btn.clicked.connect(self._refresh) + controls.addWidget(self._install_btn) + controls.addWidget(self._refresh_btn) + controls.addStretch(1) + comp_layout.addLayout(controls) + root.addWidget(comp_card) + + self._output = QTextEdit() + self._output.setObjectName("Report") + 
self._output.setReadOnly(True) + self._output.setMinimumHeight(180) + self._output.setVisible(False) + root.addWidget(self._output) + root.addStretch(1) + + self._refresh() + + def _refresh(self) -> None: + self._env.setText( + f"Distro: {sysenv.distro_name()} " + f"Package manager: {sysenv.package_manager() or 'none (apt required)'} " + f"GPU: {', '.join(sysenv.gpu_vendors()) or 'unknown'}" + ) + while self._components.count(): + item = self._components.takeAt(0) + w = item.widget() + if w is not None: + w.deleteLater() + + status = installer.component_status() + for component, present in status: + mark = "βœ“" if present else "βœ—" + color = GOOD if present else MUTED + row = QLabel(f"[{mark}] " + f"{component.name} β€” {component.enables}") + row.setTextFormat(Qt.TextFormat.RichText) + row.setWordWrap(True) + self._components.addWidget(row) + + self._missing = [c for c, present in status if not present] + self._install_btn.setEnabled(bool(self._missing) and sysenv.package_manager() == "apt") + if not self._missing: + self._install_btn.setText("All installed βœ”") + + def _install(self) -> None: + packages = installer.missing_packages(self._missing) + if not packages: + return + self._install_btn.setEnabled(False) + self._install_btn.setText("Installing… (may prompt for password)") + self._output.setVisible(True) + self._output.setPlainText(f"Installing: {' '.join(packages)}\n") + threading.Thread(target=self._work, args=(packages,), daemon=True).start() + + def _work(self, packages: list[str]) -> None: + rc, out = installer.install_packages(packages) + self._installed.emit(rc, out) + + def _on_installed(self, rc: int, out: str) -> None: + self._output.setPlainText(out[-4000:]) + self._install_btn.setText("Install missing") + self._refresh() diff --git a/src/rigdoctor/render.py b/src/rigdoctor/render.py index c37776c..691e5cf 100644 --- a/src/rigdoctor/render.py +++ b/src/rigdoctor/render.py @@ -99,6 +99,25 @@ def _aggregate_peaks(maxima: dict) -> list[tuple[str, 
str, float, str, float, st return rows +_SEV_LABEL = {"critical": "CRITICAL", "warning": "WARNING", "info": "INFO", "ok": "OK"} + + +def render_health(findings: list) -> str: + if not findings: + return "Health report: no findings." + crit = sum(1 for f in findings if f.severity == "critical") + warn = sum(1 for f in findings if f.severity == "warning") + lines = ["Health report", "", f" {crit} critical Β· {warn} warning Β· {len(findings)} checks", ""] + for f in findings: + lines.append(f"[{_SEV_LABEL.get(f.severity, '?')}] {f.category}: {f.title}") + if f.detail: + lines.append(f" {f.detail}") + if f.suggestion: + lines.append(f" β†’ {f.suggestion}") + lines.append("") + return "\n".join(lines).rstrip() + + def render_summary(summary: Summary, log_path=None) -> str: if summary.samples == 0 and not summary.events: where = f" ({log_path})" if log_path else "" diff --git a/tests/test_health.py b/tests/test_health.py new file mode 100644 index 0000000..edf6e76 --- /dev/null +++ b/tests/test_health.py @@ -0,0 +1,46 @@ +"""Tests for the M4 health report's log scanner (synthetic input).""" + +import unittest + +from rigdoctor.core.health import CRITICAL, WARNING, run_health_checks, scan_journal_text + + +class HealthScanTests(unittest.TestCase): + def test_xid_79_is_critical(self): + text = "NVRM: Xid (PCI:0000:01:00): 79, pid=1234, GPU has fallen off the bus." 
+ findings = scan_journal_text(text) + gpu = [f for f in findings if f.category == "GPU"] + self.assertEqual(len(gpu), 1) + self.assertIn("79", gpu[0].title) + self.assertEqual(gpu[0].severity, CRITICAL) + + def test_xid_count_aggregates(self): + text = "\n".join(["NVRM: Xid (PCI:0000:01:00): 79, foo"] * 3) + gpu = [f for f in scan_journal_text(text) if f.category == "GPU"][0] + self.assertIn("Γ—3", gpu.title) + + def test_oom_and_panic_detected(self): + text = "Out of memory: Killed process 999 (game)\nKernel panic - not syncing: x" + cats = {f.category for f in scan_journal_text(text)} + self.assertIn("Memory", cats) + self.assertIn("Kernel", cats) + + def test_mce_critical(self): + findings = scan_journal_text("mce: [Hardware Error]: Machine check events logged") + self.assertTrue(any(f.severity == CRITICAL and f.category == "Hardware" for f in findings)) + + def test_clean_text_yields_no_findings(self): + self.assertEqual(scan_journal_text("usb 1-1: new high-speed USB device\nbluetooth: ok"), []) + + def test_run_health_checks_returns_findings(self): + # Runs against the real system; just assert it returns a sorted list of Findings. 
+ findings = run_health_checks() + self.assertIsInstance(findings, list) + severities = [f.severity for f in findings] + order = {"critical": 0, "warning": 1, "info": 2, "ok": 3} + ranks = [order.get(s, 9) for s in severities] + self.assertEqual(ranks, sorted(ranks)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_installer.py b/tests/test_installer.py new file mode 100644 index 0000000..15a9c50 --- /dev/null +++ b/tests/test_installer.py @@ -0,0 +1,46 @@ +"""Tests for the M9 installer logic and the M13 version comparison.""" + +import unittest + +from rigdoctor.core import installer +from rigdoctor.core.catalog import Component +from rigdoctor.core.updates import is_newer + + +class InstallerTests(unittest.TestCase): + def test_component_status_uses_presence(self): + status = installer.component_status(present=lambda cmd: cmd == "smartctl") + by_id = {c.id: ok for c, ok in status} + self.assertTrue(by_id["smartmontools"]) + self.assertFalse(by_id["dmidecode"]) + + def test_missing_packages_dedup_preserves_order(self): + comps = [ + Component("a", "A", "B", "x", ("p1", "p2"), "c1"), + Component("b", "B", "B", "y", ("p2", "p3"), "c2"), + ] + self.assertEqual(installer.missing_packages(comps), ["p1", "p2", "p3"]) + + def test_apt_command_includes_packages(self): + joined = " ".join(installer.apt_install_command(["smartmontools", "dmidecode"])) + self.assertIn("smartmontools", joined) + self.assertIn("dmidecode", joined) + self.assertIn("apt-get install", joined) + + def test_install_nothing_is_noop(self): + rc, _ = installer.install_packages([]) + self.assertEqual(rc, 0) + + +class UpdateTests(unittest.TestCase): + def test_is_newer(self): + self.assertTrue(is_newer("v0.0.5", "0.0.4")) + self.assertFalse(is_newer("v0.0.4", "0.0.4")) + self.assertFalse(is_newer("v0.0.3", "0.0.4")) + + def test_is_newer_handles_garbage(self): + self.assertFalse(is_newer("not-a-version", "0.0.4")) + + +if __name__ == "__main__": + unittest.main()