feat(health): GPU stress monitor + per-drive SMART health/wear
Two diagnostics for the load-correlated GPU crashes and for storage wear. GPU stress (`rigdoctor stress` + a System Health "Stress test…" dialog): drive a GPU load and sample sensors at high rate, then report per-metric min/avg/peak, time spent above each temp threshold, power vs limit, throttling (decoded from the NVML clocks-event bitmask), and any GPU fault (Xid / VA-space freeze / query-timeout hang) in the window. Load source: explicit --command, an auto-detected loader, or monitor-only (you launch the game). Analysis is a pure, unit-tested function. Drive health (core/drives.py): parse full `smartctl --json` per drive into prioritized findings — SMART verdict, derived life-left % (NVMe percentage_used or SATA wear-leveling), power-on hours, TBW, temperature, and failure predictors (reallocated/pending/offline sectors, NVMe media errors, low spare). Replaces the old pass/fail-only check_smart; runs through the same elevated path (collect-priv / sudo), degrading to "needs root" notes unprivileged. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
"""Tests for drive health parsing & findings (synthetic smartctl JSON)."""
|
||||
|
||||
import unittest
|
||||
from dataclasses import asdict
|
||||
|
||||
from rigdoctor.core import drives
|
||||
from rigdoctor.core.health import CRITICAL, INFO, OK, WARNING
|
||||
|
||||
# Synthetic smartctl JSON for an NVMe drive in good shape: SMART verdict passed,
# low percentage_used, spare well above its threshold, and no media errors.
_NVME_OK = {
    "model_name": "Samsung SSD 980 PRO 1TB",
    "device": {"protocol": "NVMe"},
    "smart_status": {"passed": True},
    "temperature": {"current": 41},
    "power_on_time": {"hours": 1234},
    "nvme_smart_health_information_log": {
        "percentage_used": 3,
        "available_spare": 100,
        "available_spare_threshold": 10,
        "media_errors": 0,
        "data_units_written": 200_000_000,  # ~102 TB
    },
}
|
||||
|
||||
# Synthetic smartctl JSON for an NVMe drive whose SMART verdict still passes but
# whose percentage_used is very high (96). Temperature and power-on hours are
# deliberately absent from this fixture.
_NVME_WORN = {
    "model_name": "Worn NVMe",
    "device": {"protocol": "NVMe"},
    "smart_status": {"passed": True},
    "nvme_smart_health_information_log": {
        "percentage_used": 96,
        "available_spare": 100,
        "available_spare_threshold": 10,
    },
}
|
||||
|
||||
# Synthetic smartctl JSON for a SATA SSD that fails its SMART verdict. The
# attribute table carries a non-zero reallocated-sector raw count, a
# wear-leveling normalized value, and a total-LBAs-written raw count.
_SATA_FAILING = {
    "model_name": "Samsung SSD 870 QVO 1TB",
    "device": {"protocol": "ATA"},
    "smart_status": {"passed": False},
    "temperature": {"current": 35},
    "power_on_time": {"hours": 5000},
    "ata_smart_attributes": {"table": [
        {"id": 5, "name": "Reallocated_Sector_Ct", "value": 80, "raw": {"value": 12}},
        {"id": 177, "name": "Wear_Leveling_Count", "value": 88, "raw": {"value": 300}},
        {"id": 241, "name": "Total_LBAs_Written", "value": 99, "raw": {"value": 2_000_000_000}},
    ]},
}
|
||||
|
||||
|
||||
class ParseTests(unittest.TestCase):
    """Check the fields ``drives.parse`` derives from synthetic smartctl JSON."""

    def test_nvme_parse(self):
        # Healthy NVMe fixture: each derived field is compared with the value
        # implied by the raw payload.
        d = drives.parse("/dev/nvme0", _NVME_OK)
        self.assertEqual(d.kind, "nvme")
        self.assertTrue(d.passed)
        self.assertEqual(d.percent_used, 3)
        self.assertEqual(d.health_pct, 97)  # 100 - percentage_used
        self.assertEqual(d.power_on_hours, 1234)
        self.assertEqual(d.temp_c, 41)
        self.assertAlmostEqual(d.data_written_tb, 102.4, places=1)

    def test_sata_parse(self):
        # Failing SATA fixture: the verdict, sector count and wear figure come
        # from the ATA attribute table.
        d = drives.parse("/dev/sda", _SATA_FAILING)
        self.assertEqual(d.kind, "sata")
        self.assertFalse(d.passed)
        self.assertEqual(d.reallocated, 12)  # raw value
        self.assertEqual(d.health_pct, 88)  # normalized wear-leveling value
        self.assertAlmostEqual(d.data_written_tb, 1.02, places=1)

    def test_needs_root_when_no_data(self):
        # Passing None instead of a smartctl payload must flag the drive as
        # needing root.
        d = drives.parse("/dev/sda", None)
        self.assertTrue(d.needs_root)

    def test_roundtrip_through_dicts(self):
        # A parsed drive converted with dataclasses.asdict and fed back through
        # drives.from_dicts must keep its model and health percentage.
        d = drives.parse("/dev/nvme0", _NVME_OK)
        back = drives.from_dicts([asdict(d)])
        self.assertEqual(len(back), 1)
        self.assertEqual(back[0].model, d.model)
        self.assertEqual(back[0].health_pct, d.health_pct)
|
||||
|
||||
|
||||
class FindingTests(unittest.TestCase):
    """Check the severity and wording of findings built by ``drives.to_findings``.

    Each test parses one fixture, converts it to findings, and inspects the
    first finding returned.
    """

    def test_healthy_nvme_is_ok_with_stats(self):
        # A healthy drive reports OK and surfaces life-left and power-on hours
        # in the title.
        f = drives.to_findings([drives.parse("/dev/nvme0", _NVME_OK)])[0]
        self.assertEqual(f.severity, OK)
        self.assertIn("97% life left", f.title)
        self.assertIn("1,234 h", f.title)

    def test_failing_sata_is_critical(self):
        # A failed SMART verdict is CRITICAL, and the detail text names both
        # the failure and the reallocated sectors.
        f = drives.to_findings([drives.parse("/dev/sda", _SATA_FAILING)])[0]
        self.assertEqual(f.severity, CRITICAL)
        self.assertIn("FAILED", f.detail)
        self.assertIn("reallocated sectors", f.detail)

    def test_worn_nvme_is_warning(self):
        # A passing but heavily used drive is a WARNING whose title says "worn".
        f = drives.to_findings([drives.parse("/dev/nvme1", _NVME_WORN)])[0]
        self.assertEqual(f.severity, WARNING)
        self.assertIn("worn", f.title)

    def test_needs_root_is_info(self):
        # With no smartctl data the finding is only INFO and says root is needed.
        f = drives.to_findings([drives.parse("/dev/sda", None)])[0]
        self.assertEqual(f.severity, INFO)
        self.assertIn("needs root", f.title)
|
||||
|
||||
|
||||
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user