feat(health): GPU stress monitor + per-drive SMART health/wear
Two diagnostics for the load-correlated GPU crashes and for storage wear.

GPU stress (`rigdoctor stress` + a System Health "Stress test…" dialog): drive a GPU load and sample sensors at a high rate, then report per-metric min/avg/peak, time spent above each temp threshold, power vs limit, throttling (decoded from the NVML clocks-event bitmask), and any GPU fault (Xid / VA-space freeze / query-timeout hang) in the window. Load source: explicit --command, an auto-detected loader, or monitor-only (you launch the game). Analysis is a pure, unit-tested function.

Drive health (core/drives.py): parse full `smartctl --json` per drive into prioritized findings — SMART verdict, derived life-left % (NVMe percentage_used or SATA wear-leveling), power-on hours, TBW, temperature, and failure predictors (reallocated/pending/offline sectors, NVMe media errors, low spare). Replaces the old pass/fail-only check_smart; runs through the same elevated path (collect-priv / sudo), degrading to "needs root" notes unprivileged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
"""Tests for drive health parsing & findings (synthetic smartctl JSON)."""
|
||||
|
||||
import unittest
|
||||
from dataclasses import asdict
|
||||
|
||||
from rigdoctor.core import drives
|
||||
from rigdoctor.core.health import CRITICAL, INFO, OK, WARNING
|
||||
|
||||
# Synthetic smartctl JSON for an NVMe drive that passes SMART with 3% wear.
# The tests below expect it to parse to 97% life left, 1234 power-on hours,
# 41 degrees and roughly 102.4 TB written.
_NVME_OK = {
    "model_name": "Samsung SSD 980 PRO 1TB",
    "device": {"protocol": "NVMe"},
    "smart_status": {"passed": True},
    "temperature": {"current": 41},
    "power_on_time": {"hours": 1234},
    "nvme_smart_health_information_log": {
        "percentage_used": 3, "available_spare": 100, "available_spare_threshold": 10,
        "media_errors": 0, "data_units_written": 200_000_000,  # ~102 TB
    },
}
|
||||
|
||||
# Synthetic smartctl JSON for an NVMe drive that still passes SMART but reports
# 96% of its rated endurance used; it carries no temperature or power-on data.
_NVME_WORN = {
    "model_name": "Worn NVMe",
    "device": {"protocol": "NVMe"},
    "smart_status": {"passed": True},
    "nvme_smart_health_information_log": {
        "percentage_used": 96,
        "available_spare": 100,
        "available_spare_threshold": 10,
    },
}
|
||||
|
||||
# Synthetic smartctl JSON for a SATA SSD whose SMART verdict is "failed" and
# whose attribute table reports 12 reallocated sectors (raw), a normalized
# wear-leveling value of 88 and 2,000,000,000 LBAs written.
_SATA_FAILING = {
    "model_name": "Samsung SSD 870 QVO 1TB",
    "device": {"protocol": "ATA"},
    "smart_status": {"passed": False},
    "temperature": {"current": 35},
    "power_on_time": {"hours": 5000},
    "ata_smart_attributes": {"table": [
        {"id": 5, "name": "Reallocated_Sector_Ct", "value": 80, "raw": {"value": 12}},
        {"id": 177, "name": "Wear_Leveling_Count", "value": 88, "raw": {"value": 300}},
        {"id": 241, "name": "Total_LBAs_Written", "value": 99, "raw": {"value": 2_000_000_000}},
    ]},
}
|
||||
|
||||
|
||||
class ParseTests(unittest.TestCase):
    """Exercise ``drives.parse`` and ``drives.from_dicts`` on synthetic smartctl JSON."""

    def test_nvme_parse(self):
        """A healthy NVMe report yields kind, verdict, wear, hours, temperature and TB written."""
        d = drives.parse("/dev/nvme0", _NVME_OK)
        self.assertEqual(d.kind, "nvme")
        self.assertTrue(d.passed)
        self.assertEqual(d.percent_used, 3)
        self.assertEqual(d.health_pct, 97)          # 100 - percentage_used
        self.assertEqual(d.power_on_hours, 1234)
        self.assertEqual(d.temp_c, 41)
        self.assertAlmostEqual(d.data_written_tb, 102.4, places=1)

    def test_sata_parse(self):
        """A failing SATA report yields the failed verdict, raw reallocated count and wear."""
        d = drives.parse("/dev/sda", _SATA_FAILING)
        self.assertEqual(d.kind, "sata")
        self.assertFalse(d.passed)
        self.assertEqual(d.reallocated, 12)         # raw value
        self.assertEqual(d.health_pct, 88)          # normalized wear-leveling value
        self.assertAlmostEqual(d.data_written_tb, 1.02, places=1)

    def test_needs_root_when_no_data(self):
        """Parsing with no smartctl data (``None``) flags the drive as needing root."""
        d = drives.parse("/dev/sda", None)
        self.assertTrue(d.needs_root)

    def test_roundtrip_through_dicts(self):
        """``asdict`` followed by ``drives.from_dicts`` preserves model and health_pct."""
        d = drives.parse("/dev/nvme0", _NVME_OK)
        back = drives.from_dicts([asdict(d)])
        self.assertEqual(len(back), 1)
        self.assertEqual(back[0].model, d.model)
        self.assertEqual(back[0].health_pct, d.health_pct)
|
||||
|
||||
|
||||
class FindingTests(unittest.TestCase):
    """Check the severity and wording of the first finding ``drives.to_findings`` returns."""

    def test_healthy_nvme_is_ok_with_stats(self):
        """A healthy NVMe is OK and its title carries life-left and power-on hours."""
        f = drives.to_findings([drives.parse("/dev/nvme0", _NVME_OK)])[0]
        self.assertEqual(f.severity, OK)
        self.assertIn("97% life left", f.title)
        self.assertIn("1,234 h", f.title)

    def test_failing_sata_is_critical(self):
        """A failed SMART verdict is CRITICAL and the detail names the reallocated sectors."""
        f = drives.to_findings([drives.parse("/dev/sda", _SATA_FAILING)])[0]
        self.assertEqual(f.severity, CRITICAL)
        self.assertIn("FAILED", f.detail)
        self.assertIn("reallocated sectors", f.detail)

    def test_worn_nvme_is_warning(self):
        """A drive that passes SMART but reports 96% used is a WARNING titled as worn."""
        f = drives.to_findings([drives.parse("/dev/nvme1", _NVME_WORN)])[0]
        self.assertEqual(f.severity, WARNING)
        self.assertIn("worn", f.title)

    def test_needs_root_is_info(self):
        """A drive parsed without data produces an INFO finding titled "needs root"."""
        f = drives.to_findings([drives.parse("/dev/sda", None)])[0]
        self.assertEqual(f.severity, INFO)
        self.assertIn("needs root", f.title)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,77 @@
|
||||
"""Tests for the GPU stress + thermal-monitor analysis (synthetic ticks, no real GPU)."""
|
||||
|
||||
import unittest
|
||||
|
||||
from rigdoctor.core import stress
|
||||
from rigdoctor.core.health import CRITICAL, OK, WARNING
|
||||
|
||||
|
||||
def _tick(temp=None, power=None, throttle=(), capped=False, lost=False, dt=1.0, **extra):
    """Build one synthetic ``stress._Tick`` sample for the analysis tests.

    ``temp`` and ``power`` are stored under ``"gpu.temp"`` / ``"gpu.power"`` only
    when they are not ``None``, so a bare ``_tick()`` carries no sensor values.
    Any extra keyword arguments (e.g. ``**{"gpu.power_limit": 280}``) are merged
    into the values dict as-is and can override the two keys above.

    ``throttle`` is copied into a new list; ``capped`` is passed on as
    ``power_capped``, and ``lost`` and ``dt`` are passed through unchanged.
    """
    values = {}
    if temp is not None:
        values["gpu.temp"] = temp
    if power is not None:
        values["gpu.power"] = power
    values.update(extra)
    return stress._Tick(
        dt=dt,
        values=values,
        throttle=list(throttle),
        power_capped=capped,
        lost=lost,
    )
|
||||
|
||||
|
||||
class SummarizeTests(unittest.TestCase):
    """Exercise ``stress.summarize`` on hand-built ticks (no real GPU involved)."""

    def test_stable_run_is_ok(self):
        """Moderate temperatures under the power limit give an OK, "Stable" verdict."""
        ticks = [_tick(temp=t, power=200, **{"gpu.power_limit": 280}) for t in (60, 65, 70, 72)]
        r = stress.summarize(ticks, load="monitor-only", interval=1.0, faults=[])
        self.assertEqual(r.severity, OK)
        self.assertEqual(r.peak_temp, 72)
        self.assertEqual(r.max_power, 200)
        self.assertEqual(r.power_limit, 280)
        self.assertFalse(r.throttled)
        self.assertIn("Stable", r.verdict)

    def test_dwell_time_above_thresholds(self):
        """Time above each threshold is the sum of ``dt`` over ticks at or above it."""
        # 3 ticks of 2s each at 82/86/92 °C → ≥80 for all 6s, ≥85 for 4s, ≥90 for 2s.
        ticks = [_tick(temp=82, dt=2.0), _tick(temp=86, dt=2.0), _tick(temp=92, dt=2.0)]
        r = stress.summarize(ticks, load="x", interval=2.0, faults=[])
        self.assertEqual(r.time_above[80], 6.0)
        self.assertEqual(r.time_above[85], 4.0)
        self.assertEqual(r.time_above[90], 2.0)
        self.assertNotIn(95, r.time_above)          # never reached → omitted

    def test_throttling_is_a_warning(self):
        """A tick with a throttle reason marks the run throttled and WARNING."""
        ticks = [_tick(temp=88, throttle=["HW thermal slowdown"])]
        r = stress.summarize(ticks, load="x", interval=1.0, faults=[])
        self.assertEqual(r.severity, WARNING)
        self.assertTrue(r.throttled)
        self.assertIn("HW thermal slowdown", r.throttle_reasons)

    def test_high_temp_without_throttle_is_a_warning(self):
        """A 93-degree tick with no throttle reason is still a WARNING mentioning "hot"."""
        r = stress.summarize([_tick(temp=93)], load="x", interval=1.0, faults=[])
        self.assertEqual(r.severity, WARNING)
        self.assertIn("hot", r.verdict.lower())

    def test_gpu_lost_is_critical(self):
        """A tick flagged ``lost`` makes the run CRITICAL with ``gpu_lost`` set."""
        ticks = [_tick(temp=70), _tick(lost=True)]
        r = stress.summarize(ticks, load="x", interval=1.0, faults=[])
        self.assertEqual(r.severity, CRITICAL)
        self.assertTrue(r.gpu_lost)

    def test_journal_fault_is_critical(self):
        """A fault string passed in ``faults`` makes the run CRITICAL and shows in the verdict."""
        r = stress.summarize([_tick(temp=70)], load="x", interval=1.0,
                             faults=["NVIDIA Xid 79 ×1"])
        self.assertEqual(r.severity, CRITICAL)
        self.assertIn("Xid 79", r.verdict)

    def test_no_telemetry_is_info(self):
        """A tick with no sensor values yields severity "info" and no peak temperature."""
        r = stress.summarize([_tick()], load="monitor-only", interval=1.0, faults=[])
        # Compared against the literal: this module imports only CRITICAL, OK and WARNING.
        self.assertEqual(r.severity, "info")
        self.assertIsNone(r.peak_temp)
|
||||
|
||||
|
||||
class ThrottleDecodeTests(unittest.TestCase):
    """Sanity-check the throttle-reason table exposed as ``stress._THROTTLE_BITS``."""

    def test_throttle_bits_map_to_reasons(self):
        """Both thermal-slowdown reasons appear among the table's values."""
        # the constants used by _throttle_state decode the NVML active-reasons bitmask
        self.assertIn("HW thermal slowdown", stress._THROTTLE_BITS.values())
        self.assertIn("SW thermal slowdown", stress._THROTTLE_BITS.values())
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user