Files
jessey edc2166011 feat(health): GPU stress monitor + per-drive SMART health/wear
Two diagnostics for the load-correlated GPU crashes and for storage wear.

GPU stress (`rigdoctor stress` + a System Health "Stress test…" dialog): drive a GPU
load and sample sensors at high rate, then report per-metric min/avg/peak, time spent
above each temp threshold, power vs limit, throttling (decoded from the NVML
clocks-event bitmask), and any GPU fault (Xid / VA-space freeze / query-timeout hang)
in the window. Load source: explicit --command, an auto-detected loader, or
monitor-only (you launch the game). Analysis is a pure, unit-tested function.

Drive health (core/drives.py): parse full `smartctl --json` per drive into prioritized
findings — SMART verdict, derived life-left % (NVMe percentage_used or SATA
wear-leveling), power-on hours, TBW, temperature, and failure predictors
(reallocated/pending/offline sectors, NVMe media errors, low spare). Replaces the old
pass/fail-only check_smart; runs through the same elevated path (collect-priv / sudo),
degrading to "needs root" notes unprivileged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 16:59:06 +02:00

78 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the GPU stress + thermal-monitor analysis (synthetic ticks, no real GPU)."""
import unittest
from rigdoctor.core import stress
from rigdoctor.core.health import CRITICAL, OK, WARNING
def _tick(temp=None, power=None, throttle=(), capped=False, lost=False, dt=1.0, **extra):
    """Build one synthetic ``stress._Tick`` sample for the analysis tests.

    ``temp`` and ``power`` are stored under the ``"gpu.temp"`` and
    ``"gpu.power"`` keys only when they are not ``None``. Any ``extra``
    keyword arguments are merged into the same mapping afterwards, so they
    can add further metrics (e.g. ``"gpu.power_limit"``) or override the two
    keys above. ``throttle`` is copied into a fresh list, ``capped`` is
    forwarded as ``power_capped``, and ``lost`` / ``dt`` pass through as-is.
    """
    readings = {
        key: reading
        for key, reading in (("gpu.temp", temp), ("gpu.power", power))
        if reading is not None
    }
    readings.update(extra)
    return stress._Tick(
        dt=dt,
        values=readings,
        throttle=list(throttle),
        power_capped=capped,
        lost=lost,
    )
class SummarizeTests(unittest.TestCase):
    """Checks of ``stress.summarize`` against hand-built tick sequences."""

    def test_stable_run_is_ok(self):
        # Four unthrottled, fault-free samples with power 200 against a limit
        # of 280 must be summarized as OK with a "Stable" verdict.
        power_limit = {"gpu.power_limit": 280}
        samples = [
            _tick(temp=celsius, power=200, **power_limit)
            for celsius in (60, 65, 70, 72)
        ]
        report = stress.summarize(samples, load="monitor-only", interval=1.0, faults=[])
        self.assertEqual(report.severity, OK)
        self.assertEqual(report.peak_temp, 72)
        self.assertEqual(report.max_power, 200)
        self.assertEqual(report.power_limit, 280)
        self.assertFalse(report.throttled)
        self.assertIn("Stable", report.verdict)

    def test_dwell_time_above_thresholds(self):
        # 3 ticks of 2s each at 82/86/92 °C → ≥80 for all 6s, ≥85 for 4s, ≥90 for 2s.
        samples = [_tick(temp=celsius, dt=2.0) for celsius in (82, 86, 92)]
        report = stress.summarize(samples, load="x", interval=2.0, faults=[])
        expected_dwell = ((80, 6.0), (85, 4.0), (90, 2.0))
        for threshold, seconds in expected_dwell:
            self.assertEqual(report.time_above[threshold], seconds)
        self.assertNotIn(95, report.time_above)  # never reached → omitted

    def test_throttling_is_a_warning(self):
        # A single sample flagged with a throttle reason downgrades the run to
        # WARNING and surfaces that reason in the report.
        reason = "HW thermal slowdown"
        samples = [_tick(temp=88, throttle=[reason])]
        report = stress.summarize(samples, load="x", interval=1.0, faults=[])
        self.assertEqual(report.severity, WARNING)
        self.assertTrue(report.throttled)
        self.assertIn(reason, report.throttle_reasons)

    def test_high_temp_without_throttle_is_a_warning(self):
        # A 93-degree sample with no throttle reason still yields WARNING, and
        # the verdict text mentions "hot" (case-insensitively).
        samples = [_tick(temp=93)]
        report = stress.summarize(samples, load="x", interval=1.0, faults=[])
        self.assertEqual(report.severity, WARNING)
        self.assertIn("hot", report.verdict.lower())

    def test_gpu_lost_is_critical(self):
        # One normal sample followed by a sample marked as lost is CRITICAL.
        samples = [_tick(temp=70), _tick(lost=True)]
        report = stress.summarize(samples, load="x", interval=1.0, faults=[])
        self.assertEqual(report.severity, CRITICAL)
        self.assertTrue(report.gpu_lost)

    def test_journal_fault_is_critical(self):
        # A fault string passed via ``faults`` makes an otherwise cool run
        # CRITICAL, and the fault is echoed in the verdict.
        journal_faults = ["NVIDIA Xid 79 ×1"]
        report = stress.summarize(
            [_tick(temp=70)], load="x", interval=1.0, faults=journal_faults
        )
        self.assertEqual(report.severity, CRITICAL)
        self.assertIn("Xid 79", report.verdict)

    def test_no_telemetry_is_info(self):
        # A tick carrying no sensor values yields the "info" severity and no
        # peak temperature.
        empty_sample = _tick()
        report = stress.summarize(
            [empty_sample], load="monitor-only", interval=1.0, faults=[]
        )
        self.assertEqual(report.severity, "info")
        self.assertIsNone(report.peak_temp)
class ThrottleDecodeTests(unittest.TestCase):
    """Sanity checks on the throttle-reason lookup table in ``stress``."""

    def test_throttle_bits_map_to_reasons(self):
        # the constants used by _throttle_state decode the NVML active-reasons bitmask
        for expected_reason in ("HW thermal slowdown", "SW thermal slowdown"):
            self.assertIn(expected_reason, stress._THROTTLE_BITS.values())
# Let the module be executed directly as a script; the guard keeps the runner
# from starting when the file is merely imported.
if __name__ == "__main__":
    unittest.main()