edc2166011
Two diagnostics for the load-correlated GPU crashes and for storage wear. GPU stress (`rigdoctor stress` + a System Health "Stress test…" dialog): drive a GPU load and sample sensors at high rate, then report per-metric min/avg/peak, time spent above each temp threshold, power vs limit, throttling (decoded from the NVML clocks-event bitmask), and any GPU fault (Xid / VA-space freeze / query-timeout hang) in the window. Load source: explicit --command, an auto-detected loader, or monitor-only (you launch the game). Analysis is a pure, unit-tested function. Drive health (core/drives.py): parse full `smartctl --json` per drive into prioritized findings — SMART verdict, derived life-left % (NVMe percentage_used or SATA wear-leveling), power-on hours, TBW, temperature, and failure predictors (reallocated/pending/offline sectors, NVMe media errors, low spare). Replaces the old pass/fail-only check_smart; runs through the same elevated path (collect-priv / sudo), degrading to "needs root" notes unprivileged. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
78 lines
3.2 KiB
Python
78 lines
3.2 KiB
Python
"""Tests for the GPU stress + thermal-monitor analysis (synthetic ticks, no real GPU)."""
|
||
|
||
import unittest
|
||
|
||
from rigdoctor.core import stress
|
||
from rigdoctor.core.health import CRITICAL, OK, WARNING
|
||
|
||
|
||
def _tick(temp=None, power=None, throttle=(), capped=False, lost=False, dt=1.0, **extra):
    """Build a synthetic ``stress._Tick`` sample for the tests.

    ``temp`` and ``power`` are stored under ``"gpu.temp"`` / ``"gpu.power"``
    only when they are not ``None``.  Any ``extra`` keyword is merged into the
    values mapping afterwards, which lets a test add further metrics such as
    ``"gpu.power_limit"``.  ``throttle`` is copied into a fresh list.
    """
    readings = {
        key: reading
        for key, reading in (("gpu.temp", temp), ("gpu.power", power))
        if reading is not None
    }
    readings.update(extra)
    return stress._Tick(
        dt=dt,
        values=readings,
        throttle=list(throttle),
        power_capped=capped,
        lost=lost,
    )
|
||
|
||
|
||
class SummarizeTests(unittest.TestCase):
    """Exercise ``stress.summarize`` against hand-built tick sequences."""

    def test_stable_run_is_ok(self):
        # Temps peak at 72 with power 200 under a 280 limit; no throttle, no faults.
        limit = {"gpu.power_limit": 280}
        samples = [_tick(temp=reading, power=200, **limit) for reading in (60, 65, 70, 72)]
        report = stress.summarize(samples, load="monitor-only", interval=1.0, faults=[])
        self.assertEqual(report.severity, OK)
        self.assertEqual(report.peak_temp, 72)
        self.assertEqual(report.max_power, 200)
        self.assertEqual(report.power_limit, 280)
        self.assertFalse(report.throttled)
        self.assertIn("Stable", report.verdict)

    def test_dwell_time_above_thresholds(self):
        # Three 2s ticks at 82/86/92 °C: 6s at ≥80, 4s at ≥85, 2s at ≥90.
        samples = [_tick(temp=reading, dt=2.0) for reading in (82, 86, 92)]
        report = stress.summarize(samples, load="x", interval=2.0, faults=[])
        expected_dwell = {80: 6.0, 85: 4.0, 90: 2.0}
        for threshold, seconds in expected_dwell.items():
            self.assertEqual(report.time_above[threshold], seconds)
        # 95 was never reached, so it must be omitted from the mapping.
        self.assertNotIn(95, report.time_above)

    def test_throttling_is_a_warning(self):
        # A single tick carrying a throttle reason is enough to warn.
        reason = "HW thermal slowdown"
        samples = [_tick(temp=88, throttle=[reason])]
        report = stress.summarize(samples, load="x", interval=1.0, faults=[])
        self.assertEqual(report.severity, WARNING)
        self.assertTrue(report.throttled)
        self.assertIn(reason, report.throttle_reasons)

    def test_high_temp_without_throttle_is_a_warning(self):
        # Temperature alone (93, no throttle reasons) still yields a warning.
        samples = [_tick(temp=93)]
        report = stress.summarize(samples, load="x", interval=1.0, faults=[])
        self.assertEqual(report.severity, WARNING)
        self.assertIn("hot", report.verdict.lower())

    def test_gpu_lost_is_critical(self):
        # One normal tick followed by a tick flagged as lost.
        samples = [_tick(temp=70), _tick(lost=True)]
        report = stress.summarize(samples, load="x", interval=1.0, faults=[])
        self.assertEqual(report.severity, CRITICAL)
        self.assertTrue(report.gpu_lost)

    def test_journal_fault_is_critical(self):
        # A fault string passed in by the caller must surface in the verdict.
        journal_faults = ["NVIDIA Xid 79 ×1"]
        report = stress.summarize(
            [_tick(temp=70)],
            load="x",
            interval=1.0,
            faults=journal_faults,
        )
        self.assertEqual(report.severity, CRITICAL)
        self.assertIn("Xid 79", report.verdict)

    def test_no_telemetry_is_info(self):
        # A tick with no sensor values at all: nothing to judge, so "info".
        samples = [_tick()]
        report = stress.summarize(samples, load="monitor-only", interval=1.0, faults=[])
        self.assertEqual(report.severity, "info")
        self.assertIsNone(report.peak_temp)
|
||
|
||
|
||
class ThrottleDecodeTests(unittest.TestCase):
    """Sanity check on the throttle-reason lookup table in ``stress``."""

    def test_throttle_bits_map_to_reasons(self):
        # _throttle_state uses these constants to decode the NVML
        # active-reasons bitmask, so both thermal reasons must be present.
        known_reasons = stress._THROTTLE_BITS.values()
        self.assertIn("HW thermal slowdown", known_reasons)
        self.assertIn("SW thermal slowdown", known_reasons)
|
||
|
||
|
||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|