Files
rigdoctor/tests/test_health.py
T
jessey 07bc722209
tests / core (pull_request) Successful in 12s
tests / gui-smoke (pull_request) Successful in 27s
feat(health): flag NVMe PCIe links below capability (lane-sharing) — 0.38.0
check_pcie_links() warns when an NVMe drive negotiates fewer lanes than it
supports — almost always motherboard lane-sharing (a GPU/second card or another
M.2 stealing lanes), the case the user asked about — and reports speed-only
reductions as info (slower slot / idle ASPM). GPU is excluded: NVIDIA drops its
PCIe gen+width at idle, so a snapshot would false-alarm. Reuses inventory
read_link/nvme_controllers (refactored to public). Wired into run_health_checks;
+tests. Folded into the 0.38.0 PCIe work.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 16:49:47 +02:00

86 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the M4 health report's log scanner (synthetic input)."""
import unittest
from pathlib import Path
from unittest import mock
from rigdoctor.core import health
from rigdoctor.core.health import (
CRITICAL,
INFO,
WARNING,
check_pcie_links,
run_health_checks,
scan_journal_text,
)
class HealthScanTests(unittest.TestCase):
    """Feed hand-written journal snippets to ``scan_journal_text`` and check
    the category/severity of what comes back, plus one smoke test of
    ``run_health_checks``.
    """

    @staticmethod
    def _in_category(findings, category):
        # Keep only the findings whose ``category`` equals the requested one.
        return [item for item in findings if item.category == category]

    def test_xid_79_is_critical(self):
        # One Xid 79 line must yield exactly one GPU finding, titled with the
        # Xid number and rated critical.
        journal = "NVRM: Xid (PCI:0000:01:00): 79, pid=1234, GPU has fallen off the bus."
        scanned = scan_journal_text(journal)
        gpu_findings = self._in_category(scanned, "GPU")
        self.assertEqual(len(gpu_findings), 1)
        only = gpu_findings[0]
        self.assertIn("79", only.title)
        self.assertEqual(only.severity, CRITICAL)

    def test_xid_count_aggregates(self):
        # Three identical Xid lines are expected to be reported with a count.
        xid_line = "NVRM: Xid (PCI:0000:01:00): 79, foo"
        journal = "\n".join([xid_line, xid_line, xid_line])
        first_gpu = self._in_category(scan_journal_text(journal), "GPU")[0]
        # NOTE: the marker is U+00D7 (multiplication sign), not the letter "x".
        self.assertIn("×3", first_gpu.title)

    def test_oom_and_panic_detected(self):
        # An OOM kill and a kernel panic in the same text must each surface
        # under their own category.
        journal = "Out of memory: Killed process 999 (game)\nKernel panic - not syncing: x"
        seen_categories = set()
        for item in scan_journal_text(journal):
            seen_categories.add(item.category)
        for expected in ("Memory", "Kernel"):
            self.assertIn(expected, seen_categories)

    def test_mce_critical(self):
        # A machine-check line must produce at least one critical Hardware
        # finding; stop looking as soon as one is found.
        scanned = scan_journal_text("mce: [Hardware Error]: Machine check events logged")
        has_critical_hardware = False
        for item in scanned:
            if item.severity == CRITICAL and item.category == "Hardware":
                has_critical_hardware = True
                break
        self.assertTrue(has_critical_hardware)

    def test_clean_text_yields_no_findings(self):
        # Ordinary, harmless log lines must not trigger any finding.
        harmless = "usb 1-1: new high-speed USB device\nbluetooth: ok"
        scanned = scan_journal_text(harmless)
        self.assertEqual(scanned, [])

    def test_run_health_checks_returns_findings(self):
        # Not hermetic: this calls the real checks on the host machine, so it
        # only verifies the result's type and that it is ordered by severity.
        report = run_health_checks()
        self.assertIsInstance(report, list)
        rank_of = {
            name: position
            for position, name in enumerate(("critical", "warning", "info", "ok"))
        }
        # Unknown severities get rank 9 so they are expected to sort last.
        observed = [rank_of.get(item.severity, 9) for item in report]
        self.assertEqual(observed, sorted(observed))
class PcieLinkCheckTests(unittest.TestCase):
    """Run ``check_pcie_links`` against a single faked NVMe controller whose
    PCIe link values are supplied by each test.
    """

    def _with_link(self, cur_g, cur_w, max_g, max_w):
        # Build (but do not start) two patchers: one makes the inventory report
        # a single fake NVMe controller, the other makes every link read return
        # the (current gen, current width, max gen, max width) tuple given here.
        fake_controllers = mock.patch(
            "rigdoctor.core.inventory.nvme_controllers",
            return_value=[("nvme0", Path("/x"))],
        )
        fake_link = mock.patch(
            "rigdoctor.core.inventory.read_link",
            return_value=(cur_g, cur_w, max_g, max_w),
        )
        return (fake_controllers, fake_link)

    def test_reduced_width_is_a_warning_about_lane_sharing(self):
        # Gen4 x2 on a device that supports x4: fewer lanes than capable.
        patch_ctrls, patch_link = self._with_link(4, "2", 4, "4")
        with patch_ctrls:
            with patch_link:
                results = check_pcie_links()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, WARNING)
        self.assertIn("lane-sharing", finding.detail)

    def test_reduced_speed_only_is_info(self):
        # Gen3 x4 on a device that supports Gen4: full width, lower speed.
        patch_ctrls, patch_link = self._with_link(3, "4", 4, "4")
        with patch_ctrls:
            with patch_link:
                results = check_pcie_links()
        self.assertEqual(len(results), 1)
        finding = results[0]
        self.assertEqual(finding.severity, INFO)

    def test_full_speed_no_finding(self):
        # Link already at its maximum gen and width: nothing to report.
        patch_ctrls, patch_link = self._with_link(4, "4", 4, "4")
        with patch_ctrls:
            with patch_link:
                self.assertEqual(check_pcie_links(), [])
# Allow running this test module directly as a script; unittest.main()
# discovers and runs the TestCase classes defined in this module.
if __name__ == "__main__":
    unittest.main()