"""Tests for the GPU stress + thermal-monitor analysis (synthetic ticks, no real GPU).""" import unittest from rigdoctor.core import stress from rigdoctor.core.health import CRITICAL, OK, WARNING def _tick(temp=None, power=None, throttle=(), capped=False, lost=False, dt=1.0, **extra): values = {} if temp is not None: values["gpu.temp"] = temp if power is not None: values["gpu.power"] = power values.update(extra) return stress._Tick(dt=dt, values=values, throttle=list(throttle), power_capped=capped, lost=lost) class SummarizeTests(unittest.TestCase): def test_stable_run_is_ok(self): ticks = [_tick(temp=t, power=200, **{"gpu.power_limit": 280}) for t in (60, 65, 70, 72)] r = stress.summarize(ticks, load="monitor-only", interval=1.0, faults=[]) self.assertEqual(r.severity, OK) self.assertEqual(r.peak_temp, 72) self.assertEqual(r.max_power, 200) self.assertEqual(r.power_limit, 280) self.assertFalse(r.throttled) self.assertIn("Stable", r.verdict) def test_dwell_time_above_thresholds(self): # 3 ticks of 2s each at 82/86/92 °C → ≥80 for all 6s, ≥85 for 4s, ≥90 for 2s. ticks = [_tick(temp=82, dt=2.0), _tick(temp=86, dt=2.0), _tick(temp=92, dt=2.0)] r = stress.summarize(ticks, load="x", interval=2.0, faults=[]) self.assertEqual(r.time_above[80], 6.0) self.assertEqual(r.time_above[85], 4.0) self.assertEqual(r.time_above[90], 2.0) self.assertNotIn(95, r.time_above) # never reached → omitted def test_throttling_is_a_warning(self): ticks = [_tick(temp=88, throttle=["HW thermal slowdown"])] r = stress.summarize(ticks, load="x", interval=1.0, faults=[]) self.assertEqual(r.severity, WARNING) self.assertTrue(r.throttled) self.assertIn("HW thermal slowdown", r.throttle_reasons) def test_high_temp_without_throttle_is_a_warning(self): r = stress.summarize([_tick(temp=93)], load="x", interval=1.0, faults=[]) self.assertEqual(r.severity, WARNING) self.assertIn("hot", r.verdict.lower()) def test_gpu_lost_is_critical(self): ticks = [_tick(temp=70), _tick(lost=True)] r = stress.summarize(ticks, load="x", interval=1.0, faults=[]) self.assertEqual(r.severity, CRITICAL) self.assertTrue(r.gpu_lost) def test_journal_fault_is_critical(self): r = stress.summarize([_tick(temp=70)], load="x", interval=1.0, faults=["NVIDIA Xid 79 ×1"]) self.assertEqual(r.severity, CRITICAL) self.assertIn("Xid 79", r.verdict) def test_no_telemetry_is_info(self): r = stress.summarize([_tick()], load="monitor-only", interval=1.0, faults=[]) self.assertEqual(r.severity, "info") self.assertIsNone(r.peak_temp) class ThrottleDecodeTests(unittest.TestCase): def test_throttle_bits_map_to_reasons(self): # the constants used by _throttle_state decode the NVML active-reasons bitmask self.assertIn("HW thermal slowdown", stress._THROTTLE_BITS.values()) self.assertIn("SW thermal slowdown", stress._THROTTLE_BITS.values()) if __name__ == "__main__": unittest.main()