Merge pull request 'feat: live monitor TUI (M2) — 0.21.0' (#17) from feat/m11-tray into main

Reviewed-on: #17
feat: live monitor TUI (M2) — 0.21.0
2026-05-22 07:38:17 +00:00 · 2026-05-22 09:37:57 +02:00
8 changed files with 250 additions and 16 deletions
@@ -5,6 +5,14 @@ All notable changes to RigDoctor are recorded here. Format follows
 (`MAJOR.MINOR.PATCH`, pre-1.0). `__version__` and `pyproject.toml` must match the git
 release tag (so the auto-updater, D18, can compare versions).

+## [0.21.0] - 2026-05-22
+### Added
+- **Live monitor TUI (M2).** `rigdoctor monitor` is now a proper **curses** dashboard:
+  current / session-min / session-max per sensor, grouped by subsystem, with temperature and
+  utilization **color bands** (and GPU-lost flagged red). `q` quits, `r` resets the session
+  min/max. Falls back to a plain full-screen redraw when stdout is not a TTY or the terminal
+  cannot run curses (`--plain` forces it). This is the terminal face of the same live data the
+  GUI dashboard graphs, and it completes the Monitoring bundle.
+
 ## [0.20.0] - 2026-05-22
 ### Changed
 - **Reorganized navigation** into grouped sidebar sections — **Monitor** (Dashboard) ·
@@ -11,7 +11,7 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done
 | M1 | Sensor core | Essential | none (nvidia-smi, sysfs) | all (NVIDIA first) | P0 | ✅ |
 | M3 | Crash-capture logger | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | ✅ |
 | M4 | Health report (log scan) | Essential | none (opt: smartmontools) | all (NVIDIA first) | P0 | ✅ |
-| M2 | Live monitor (TUI) | Monitoring | none (stdlib curses) | all | P1 | ⬜ |
+| M2 | Live monitor (TUI) | Monitoring | none (stdlib curses) | all | P1 | ✅ |
 | M8 | Alerting | Monitoring | libnotify (opt) | all | P2 | ✅ |
 | M5 | System inventory | Diagnostics | none (opt: lm-sensors, dmidecode) | all | P1 | ✅ |
 | M6 | Gaming env checks | Diagnostics | none | all | P2 | 🟨 |
@@ -41,7 +41,10 @@ Status: ⬜ not started · 🟦 designing · 🟨 in progress · ✅ done
  findings (see SPEC §4). *Implemented:* journalctl scan (Xid/panic/OOM/MCE/AER/thermal/amdgpu),
  SMART, NVIDIA driver-mismatch, journald-persistence + live-temp checks; `rigdoctor report`
  (text/JSON) + GUI Health tab. GPU-firmware verification deferred.
- **M2 Live monitor** — depends on M1; the terminal "HWMonitor for Linux" face. Stdlib-only.
+- **M2 Live monitor** — the terminal "HWMonitor for Linux" face. *Implemented (`tui.py`):*
+  `rigdoctor monitor` is a stdlib **curses** dashboard — current / session-min / session-max
+  per sensor, grouped by subsystem, with temperature & utilization color bands; `q` quits,
+  `r` resets the min/max. Falls back to a plain redraw on a non-TTY (`--plain` forces it).
 - **M5 / M6 Diagnostics** — inventory export + gaming-env checks; M6 flags risky settings and
  suggests the fix command but does not apply it (D9). *M6 implemented (Steam detection first —
  the D12 "pick a game" foundation):* discovers Steam installs + all library folders
@@ -22,7 +22,8 @@ Ubuntu + NVIDIA first; `.deb` distribution (see `DECISIONS.md`).
  last readings + a plausible cause.

 ## Phase 2 — Live monitor (terminal)
- [ ] M2 TUI dashboard (current/min/max, grouped, throttle highlighting)
+- [x] M2 TUI dashboard (`rigdoctor monitor`, `tui.py`): curses, current/min/max grouped by
+      subsystem with temp/usage color bands; q quit / r reset; plain-redraw fallback on non-TTY
 - [ ] M8 basic alerting (overheat/throttle/GPU-lost notifications)

 ## Phase 3 — Diagnostics breadth
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "rigdoctor"
-version = "0.20.0"
+version = "0.21.0"
 description = "Modular hardware monitoring & crash diagnostics for Linux gamers."
 readme = "README.md"
 requires-python = ">=3.11"
@@ -1,3 +1,3 @@
 """RigDoctor — modular hardware monitoring & crash diagnostics for Linux gamers."""

-__version__ = "0.20.0"
+__version__ = "0.21.0"
@@ -44,17 +44,10 @@ def cmd_snapshot(args) -> int:


 def cmd_monitor(args) -> int:
+    from .tui import run
+
    interval = args.interval or load_config()["interval"]
-    try:
-        for sample in _sampler().stream(interval=interval):
-            # Basic full-screen redraw; the rich TUI (M2) comes later.
-            print("\033[2J\033[H", end="")
-            print(f"RigDoctor — live  (every {interval:g}s, Ctrl-C to quit)\n")
-            print(render_snapshot(sample))
-            sys.stdout.flush()
-    except KeyboardInterrupt:
-        print()
-    return 0
+    return run(interval, plain=getattr(args, "plain", False))


 def cmd_gui(args) -> int:
@@ -516,8 +509,9 @@ def build_parser() -> argparse.ArgumentParser:
    sp.add_argument("--json", action="store_true", help="output JSON instead of text")
    sp.set_defaults(func=cmd_snapshot)

-    mp = sub.add_parser("monitor", help="live-refreshing sensor view")
+    mp = sub.add_parser("monitor", help="live monitor TUI (current/min/max, M2)")
    mp.add_argument("-n", "--interval", type=float, default=None, help="refresh interval (s)")
+    mp.add_argument("--plain", action="store_true", help="plain redraw instead of the curses UI")
    mp.set_defaults(func=cmd_monitor)

    sub.add_parser("gui", help="launch the desktop GUI (needs PySide6)").set_defaults(func=cmd_gui)
@@ -0,0 +1,170 @@
+"""Live monitor TUI (M2): a curses HWMonitor-style terminal dashboard.
+
+Shows current / session-min / session-max per sensor, grouped by subsystem, with
+temperature and utilization color bands. stdlib `curses` only; falls back to a plain
+full-screen redraw when stdout isn't a TTY (piped/SSH-without-tty). Keys: q quit, r reset
+the session min/max. The terminal face of the same live data the GUI dashboard graphs.
+"""
+
+from __future__ import annotations
+
+import curses
+import sys
+import time
+
+from .core.sample import Reading, Sample
+from .core.sampler import Sampler
+from .core.sources import available_sources
+from .render import _GROUP_ORDER, _GROUP_TITLES, format_raw, metric_label, render_snapshot
+
+# Color-band thresholds (mirror the GUI dashboard so both faces agree).
+# Temperature readings (unit "°C") are compared against these floors.
+TEMP_COLD = 50.0  # below this: "cold"; from here up: "good"
+TEMP_WARN = 78.0  # from here up: "warn"
+TEMP_CRIT = 88.0  # from here up: "crit"
+# Utilization readings (unit "%", metric in _USAGE_METRICS) use these floors.
+USAGE_WARN = 85.0  # from here up: "warn"; below: "good"
+USAGE_CRIT = 95.0  # from here up: "crit"
+# Only these "%" metrics are banded as utilization; other percentages stay "normal".
+_USAGE_METRICS = {"util", "used_pct", "mem_util", "load"}
+
+
+def band(r: Reading) -> str:
+    """Classify a reading into a color band.
+
+    Returns one of ``"cold"``, ``"good"``, ``"warn"``, ``"crit"``, ``"normal"`` or ``"na"``:
+
+    * a GPU ``status`` reading is always ``"crit"`` (GPU-lost / query timeout), even
+      when it carries no value;
+    * any other reading without a value is ``"na"``;
+    * ``°C`` readings are banded by ``TEMP_COLD`` / ``TEMP_WARN`` / ``TEMP_CRIT``;
+    * ``%`` readings whose metric is in ``_USAGE_METRICS`` are banded by
+      ``USAGE_WARN`` / ``USAGE_CRIT``;
+    * everything else is ``"normal"``.
+    """
+    if (r.source, r.metric) == ("gpu", "status"):
+        return "crit"
+    level = r.value
+    if level is None:
+        return "na"
+    # Each ladder is ordered from the highest floor down; the first floor reached wins.
+    if r.unit == "°C":
+        ladder = ((TEMP_CRIT, "crit"), (TEMP_WARN, "warn"), (TEMP_COLD, "good"))
+        return next((name for floor, name in ladder if level >= floor), "cold")
+    if r.unit == "%" and r.metric in _USAGE_METRICS:
+        ladder = ((USAGE_CRIT, "crit"), (USAGE_WARN, "warn"))
+        return next((name for floor, name in ladder if level >= floor), "good")
+    return "normal"
+
+
+def track(stats: dict[str, tuple[float, float]], sample: Sample) -> None:
+    """Update the running per-sensor extremes in ``stats`` with one sample.
+
+    ``stats`` maps a reading's ``key`` to ``(lowest, highest)`` and is mutated in place.
+    Readings whose value is ``None`` are skipped, so a sensor that has not yet reported
+    a value gets no entry.
+    """
+    for reading in sample.readings:
+        value = reading.value
+        if value is None:
+            continue
+        seen = stats.get(reading.key)
+        if seen is None:
+            # First value for this key: it is both the minimum and the maximum so far.
+            stats[reading.key] = (min(value, value), max(value, value))
+        else:
+            stats[reading.key] = (min(seen[0], value), max(seen[1], value))
+
+
+# --- curses front-end -----------------------------------------------------------------
+
+# Band name -> curses color-pair number. Bands missing here ("normal", "na") are drawn
+# with the default attribute.
+_BAND_PAIR = {"cold": 1, "good": 2, "warn": 3, "crit": 4}
+
+
+def _init_colors() -> None:
+    """Register the curses color pairs used for the reading bands.
+
+    Pair numbers come from ``_BAND_PAIR``. The terminal's default background is used when
+    available; if the terminal rejects default colors, the pairs fall back to a black
+    background instead of being left uninitialized. This is best-effort: a
+    ``curses.error`` here never aborts the monitor, the display just stays monochrome.
+    """
+    palette = {
+        "cold": curses.COLOR_CYAN,
+        "good": curses.COLOR_GREEN,
+        "warn": curses.COLOR_YELLOW,
+        "crit": curses.COLOR_RED,
+    }
+    try:
+        curses.start_color()
+    except curses.error:
+        return  # no usable color support; keep the default attributes
+    try:
+        curses.use_default_colors()
+        background = -1  # -1 means "terminal default", valid only after use_default_colors()
+    except curses.error:
+        background = curses.COLOR_BLACK
+    for name, color in palette.items():
+        try:
+            curses.init_pair(_BAND_PAIR[name], color, background)
+        except curses.error:
+            pass  # one unsupported pair must not prevent the others
+
+
+def _attr(band_name: str) -> int:
+    """Return the curses attribute used to draw a value in the given band.
+
+    Bands listed in ``_BAND_PAIR`` get their color pair, and ``"crit"`` is additionally
+    bold; any other band name yields ``curses.A_NORMAL``.
+    """
+    pair_no = _BAND_PAIR.get(band_name)
+    if pair_no:
+        colored = curses.color_pair(pair_no)
+        if band_name == "crit":
+            colored |= curses.A_BOLD
+        return colored
+    return curses.A_NORMAL
+
+
+def _draw(stdscr, sample: Sample, stats: dict, interval: float) -> None:
+    """Repaint the whole screen from the latest sample and the session extremes.
+
+    Layout: a title and a key-hint line, then one section per source group (groups listed
+    in ``_GROUP_ORDER`` first, any others after). Each section has a title, a column
+    header, and one row per reading showing current / min / max. Readings whose metric is
+    ``"name"`` are printed as a dimmed identity line instead of a value row. Rows that do
+    not fit in the window are dropped.
+    """
+    stdscr.erase()
+    rows, cols = stdscr.getmaxyx()
+
+    def put(y: int, x: int, text: str, attr: int = curses.A_NORMAL) -> None:
+        # Ignore anything that starts outside the window, clip the text so the last
+        # column is never written, and swallow curses errors from the write itself.
+        if y < 0 or y >= rows or x < 0 or x >= cols:
+            return
+        try:
+            stdscr.addnstr(y, x, text, max(0, cols - x - 1), attr)
+        except curses.error:
+            pass
+
+    put(0, 0, f"RigDoctor — live monitor   every {interval:g}s", curses.A_BOLD)
+    put(1, 0, "q quit    r reset min/max", curses.A_DIM)
+
+    by_source = sample.by_source()
+    known = [name for name in _GROUP_ORDER if name in by_source]
+    extra = [name for name in by_source if name not in _GROUP_ORDER]
+
+    label_w, value_w = 24, 11
+    left = 2
+    cur_x = left + label_w
+    min_x = cur_x + value_w
+    max_x = min_x + value_w
+    table_head = "sensor".ljust(label_w) + "".join(
+        title.rjust(value_w) for title in ("current", "min", "max")
+    )
+
+    y = 3
+    for source in known + extra:
+        if y >= rows:
+            break
+        put(y, 0, _GROUP_TITLES.get(source, source.title()), curses.A_BOLD)
+        put(y + 1, left, table_head, curses.A_DIM)
+        y += 2
+        for reading in by_source[source]:
+            if y >= rows:
+                break
+            if reading.metric == "name":  # device identity line
+                put(y, left, str(reading.label), curses.A_DIM)
+            else:
+                # No extremes recorded for this key: show the current value for both.
+                lo, hi = stats.get(reading.key, (reading.value, reading.value))
+                put(y, left, f"{metric_label(reading):<{label_w}}")
+                current = f"{format_raw(reading.value, reading.unit):>{value_w}}"
+                put(y, cur_x, current, _attr(band(reading)))
+                put(y, min_x, f"{format_raw(lo, reading.unit):>{value_w}}", curses.A_DIM)
+                put(y, max_x, f"{format_raw(hi, reading.unit):>{value_w}}", curses.A_DIM)
+            y += 1
+        y += 1  # blank line between groups
+    stdscr.refresh()
+
+
+def _loop(stdscr, sampler: Sampler, interval: float) -> None:
+    """Run the interactive curses session until the user presses ``q``.
+
+    Takes a sample every ``interval`` seconds, folds it into the session min/max and
+    repaints. Keys: ``q``/``Q`` returns, ``r``/``R`` resets the min/max to the latest
+    sample. The screen is repainted only when something changed (new sample, reset or
+    terminal resize) rather than on every poll.
+    """
+    try:
+        curses.curs_set(0)
+    except curses.error:
+        pass  # some terminals cannot hide the cursor; that must not cost us the TUI
+    stdscr.nodelay(True)
+    _init_colors()
+    stats: dict[str, tuple[float, float]] = {}
+    latest = sampler.sample()
+    track(stats, latest)
+    next_sample = time.monotonic() + interval
+    dirty = True  # paint the first sample immediately
+    while True:
+        ch = stdscr.getch()
+        if ch in (ord("q"), ord("Q")):
+            return
+        if ch in (ord("r"), ord("R")):
+            stats.clear()
+            track(stats, latest)
+            dirty = True
+        elif ch == curses.KEY_RESIZE:
+            dirty = True  # window size changed; lay the screen out again
+        now = time.monotonic()
+        if now >= next_sample:
+            latest = sampler.sample()
+            track(stats, latest)
+            next_sample = now + interval
+            dirty = True
+        if dirty:
+            _draw(stdscr, latest, stats, interval)
+            dirty = False
+        time.sleep(0.05)  # keep key handling responsive without busy-spinning
+
+
+def _run_plain(sampler: Sampler, interval: float) -> int:
+    """Plain (no curses) monitor: clear the screen and reprint the snapshot each tick.
+
+    Iterates ``sampler.stream(interval=interval)`` until interrupted with Ctrl-C, then
+    prints a final newline. Always returns 0.
+    """
+    clear_and_home = "\033[2J\033[H"
+    try:
+        for snap in sampler.stream(interval=interval):
+            print(clear_and_home, end="")
+            print(f"RigDoctor — live  (every {interval:g}s, Ctrl-C to quit)\n")
+            print(render_snapshot(snap))
+            sys.stdout.flush()
+    except KeyboardInterrupt:
+        print()
+    return 0
+
+
+def run(interval: float, plain: bool = False) -> int:
+    """Entry point for ``rigdoctor monitor``; returns the process exit code (always 0).
+
+    Uses the plain redraw when ``plain`` is set or stdout is not a TTY; otherwise runs the
+    curses dashboard, degrading to the plain redraw if curses raises ``curses.error``.
+    Ctrl-C in the curses dashboard is treated as a normal quit, matching the plain mode.
+    """
+    sampler = Sampler(available_sources())
+    if plain or not sys.stdout.isatty():
+        return _run_plain(sampler, interval)
+    try:
+        curses.wrapper(_loop, sampler, interval)
+    except KeyboardInterrupt:
+        # curses.wrapper has already restored the terminal; exit quietly, no traceback.
+        return 0
+    except curses.error:  # terminal can't do curses — degrade gracefully
+        return _run_plain(sampler, interval)
+    return 0
@@ -0,0 +1,58 @@
+"""Tests for the M2 live-monitor TUI logic (min/max tracking + color bands)."""
+
+import unittest
+
+from rigdoctor import tui
+from rigdoctor.core.sample import Reading, Sample
+
+
+def _temp(v):
+    """Build a GPU ``temp`` reading in °C with value ``v`` and an empty label."""
+    reading = Reading("gpu", "temp", v, "°C", "")
+    return reading
+
+
+class TrackTests(unittest.TestCase):
+    """``tui.track`` folds samples into per-key ``(min, max)`` session extremes."""
+
+    def test_tracks_min_and_max(self):
+        extremes: dict = {}
+        samples = [Sample(0.0, [_temp(v)]) for v in (60.0, 80.0, 70.0, 55.0)]
+        for sample in samples:
+            tui.track(extremes, sample)
+        self.assertEqual(extremes["gpu.temp"], (55.0, 80.0))
+
+    def test_ignores_none_values(self):
+        extremes: dict = {}
+        valueless = Sample(0.0, [_temp(None)])
+        tui.track(extremes, valueless)
+        self.assertEqual(extremes, {})
+
+    def test_keys_separate_by_label(self):
+        # Two readings that differ only by label must be tracked under distinct keys.
+        cores = [
+            Reading("cpu", "temp", 50.0, "°C", "Core 0"),
+            Reading("cpu", "temp", 70.0, "°C", "Core 1"),
+        ]
+        extremes: dict = {}
+        tui.track(extremes, Sample(0.0, cores))
+        self.assertEqual(extremes["cpu.temp.Core 0"], (50.0, 50.0))
+        self.assertEqual(extremes["cpu.temp.Core 1"], (70.0, 70.0))
+
+
+class BandTests(unittest.TestCase):
+    """``tui.band`` maps a reading to its color band."""
+
+    def test_temperature_bands(self):
+        expected = {40.0: "cold", 60.0: "good", 80.0: "warn", 90.0: "crit"}
+        for celsius, name in expected.items():
+            with self.subTest(celsius=celsius):
+                self.assertEqual(tui.band(_temp(celsius)), name)
+
+    def test_usage_bands(self):
+        cases = (
+            (Reading("gpu", "util", 50.0, "%"), "good"),
+            (Reading("gpu", "util", 88.0, "%"), "warn"),
+            (Reading("memory", "used_pct", 96.0, "%"), "crit"),
+        )
+        for reading, name in cases:
+            with self.subTest(expected=name):
+                self.assertEqual(tui.band(reading), name)
+
+    def test_non_metric_percentage_is_normal(self):
+        # "fan" is a percentage but not a utilization metric, so it is not banded.
+        fan = Reading("gpu", "fan", 100.0, "%")
+        self.assertEqual(tui.band(fan), "normal")
+
+    def test_gpu_lost_is_crit(self):
+        lost = Reading("gpu", "status", None, "", "query-timeout")
+        self.assertEqual(tui.band(lost), "crit")
+
+    def test_missing_value_is_na(self):
+        no_power = Reading("gpu", "power", None, "W")
+        self.assertEqual(tui.band(no_power), "na")
+
+
+# Allow running this test module directly as a script.
+if __name__ == "__main__":
+    unittest.main()