feat(health): flag NVMe PCIe links below capability (lane-sharing) — 0.38.0 #41

Merged
jessey merged 1 commit from feat/inventory-pcie into main 2026-05-22 14:51:13 +00:00
4 changed files with 102 additions and 7 deletions
Showing only changes of commit 07bc722209 - Show all commits
+5
View File
@@ -12,6 +12,11 @@ release tag (so the auto-updater, D18, can compare versions).
(`current/max_link_speed` + width). If a drive negotiates below its capability (a slower M.2 (`current/max_link_speed` + width). If a drive negotiates below its capability (a slower M.2
slot, lane-sharing, or a downtrain) it's flagged: `PCIe Gen3 x4 (capable of Gen4 x4)`. So you slot, lane-sharing, or a downtrain) it's flagged: `PCIe Gen3 x4 (capable of Gen4 x4)`. So you
can confirm a Gen4 SSD is actually in a Gen4 slot. (SATA disks show no PCIe link.) can confirm a Gen4 SSD is actually in a Gen4 slot. (SATA disks show no PCIe link.)
- **System Health flags downtrained NVMe links.** A new check warns when an NVMe drive negotiates
fewer PCIe lanes than it supports (almost always motherboard **lane-sharing** — a GPU/second
card or another M.2 stealing lanes) and notes speed-only reductions as info (a slower slot or
idle ASPM). The GPU is deliberately excluded — NVIDIA drops its PCIe gen/width at idle, so a
snapshot would false-alarm.
## [0.37.1] - 2026-05-22 ## [0.37.1] - 2026-05-22
### Fixed ### Fixed
+33
View File
@@ -251,6 +251,38 @@ def check_live_temps() -> list[Finding]:
)] )]
def _pcie_lane_count(raw) -> int | None:
    """Parse a raw link-width value (e.g. '4') into a positive lane count, or None if unusable."""
    try:
        lanes = int(raw)
    except (TypeError, ValueError):
        # Empty/unreadable attribute, or something that is not a number at all.
        return None
    # A width of 0 is not a real link; treat it like "unknown" rather than "x0".
    return lanes if lanes > 0 else None


def check_pcie_links() -> list[Finding]:
    """Flag NVMe drives linked below their PCIe capability.

    The usual causes are a slower slot or, most often, motherboard lane-sharing where a
    GPU/second card or another M.2 steals lanes from the slot.

    Width reductions are reliable and reported as warnings; speed-only reductions are info
    (they can also be normal link power management at idle). Widths are compared numerically,
    so only a link that is genuinely *narrower* than the drive's maximum is reported as a
    width problem; an unreadable or nonsensical width falls through to the speed comparison.

    The GPU is intentionally not checked here: NVIDIA drops its PCIe gen *and* width at idle,
    so a point-in-time snapshot is misleading.

    Returns:
        One Finding per affected controller; an empty list when every link is at capability
        or cannot be read.
    """
    from . import inventory

    findings: list[Finding] = []
    for name, dev in inventory.nvme_controllers():
        cur_g, cur_w, max_g, max_w = inventory.read_link(dev)
        if not cur_g or not max_g:
            # Without both generations there is nothing meaningful to compare.
            continue
        cur_lanes = _pcie_lane_count(cur_w)
        max_lanes = _pcie_lane_count(max_w)
        if cur_lanes is not None and max_lanes is not None and cur_lanes < max_lanes:
            # Fewer lanes than supported -> almost always lane-sharing.
            findings.append(Finding(
                WARNING, "PCIe", f"{name} linked at x{cur_w} (supports x{max_w})",
                f"{name} negotiated PCIe Gen{cur_g} x{cur_w}, but the drive supports "
                f"Gen{max_g} x{max_w}. Fewer lanes is usually motherboard lane-sharing — a GPU or a "
                "second card in a PCIe slot, or another populated M.2, can steal lanes from this slot.",
                "Check your board manual's lane-sharing table; move the drive to a full-x4 "
                "(often CPU-attached) M.2 slot."))
        elif cur_g < max_g:
            # Width is fine (or unknown) but the generation is lower -> slower slot or idle ASPM.
            findings.append(Finding(
                INFO, "PCIe", f"{name} linked at Gen{cur_g} (supports Gen{max_g})",
                f"{name} negotiated PCIe Gen{cur_g} but supports Gen{max_g}. This can be a slower "
                "(chipset or older) M.2 slot, or normal link power management (ASPM) at idle.",
                "If you expect full speed, check the slot and the BIOS PCIe/ASPM settings."))
    return findings
def run_health_checks(include_journal: bool = True) -> list[Finding]: def run_health_checks(include_journal: bool = True) -> list[Finding]:
"""Run all checks and return findings sorted by severity (worst first). """Run all checks and return findings sorted by severity (worst first).
@@ -273,5 +305,6 @@ def run_health_checks(include_journal: bool = True) -> list[Finding]:
else: else:
findings += check_smart() findings += check_smart()
findings += check_live_temps() findings += check_live_temps()
findings += check_pcie_links()
findings.sort(key=lambda f: _ORDER.get(f.severity, 9)) findings.sort(key=lambda f: _ORDER.get(f.severity, 9))
return findings return findings
+24 -6
View File
@@ -134,11 +134,10 @@ def _gen(speed: str) -> int | None:
return _PCIE_GEN.get(tok) return _PCIE_GEN.get(tok)
def _link_desc(dev: Path) -> str: def read_link(dev: Path) -> tuple[int | None, str, int | None, str]:
"""Describe a PCI device's negotiated PCIe link, noting if it's below its max. """Negotiated/max PCIe link for a PCI device dir: (cur_gen, cur_width, max_gen, max_width).
e.g. 'PCIe Gen4 x4', or 'PCIe Gen3 x4 (capable of Gen4 x4)' when downtrained / in a Widths are the raw sysfs strings (e.g. '4'); gens are ints (4) or None when unreadable.
slower slot.
""" """
def rd(name: str) -> str: def rd(name: str) -> str:
try: try:
@@ -146,8 +145,17 @@ def _link_desc(dev: Path) -> str:
except OSError: except OSError:
return "" return ""
cur_g, cur_w = _gen(rd("current_link_speed")), rd("current_link_width") return (_gen(rd("current_link_speed")), rd("current_link_width"),
max_g, max_w = _gen(rd("max_link_speed")), rd("max_link_width") _gen(rd("max_link_speed")), rd("max_link_width"))
def _link_desc(dev: Path) -> str:
"""Describe a PCI device's negotiated PCIe link, noting if it's below its max.
e.g. 'PCIe Gen4 x4', or 'PCIe Gen3 x4 (capable of Gen4 x4)' when downtrained / in a
slower slot.
"""
cur_g, cur_w, max_g, max_w = read_link(dev)
if not cur_g or not cur_w: if not cur_g or not cur_w:
return "" return ""
desc = f"PCIe Gen{cur_g} x{cur_w}" desc = f"PCIe Gen{cur_g} x{cur_w}"
@@ -156,6 +164,16 @@ def _link_desc(dev: Path) -> str:
return desc return desc
def nvme_controllers(base: Path = Path("/sys/class/nvme")) -> list[tuple[str, Path]]:
    """Return each NVMe controller as (name, pci-device-dir), e.g. ('nvme0', /sys/.../device).

    Args:
        base: Directory holding one entry per controller. Defaults to the sysfs NVMe class
            directory; overridable so the function can be pointed at another tree.

    Returns:
        Controllers ordered by their numeric index (nvme2 before nvme10). Entries whose name
        is not exactly ``nvme<digits>`` (e.g. namespaces like ``nvme0n1``) are ignored.
        An empty list when ``base`` cannot be listed.
    """
    pattern = re.compile(r"nvme(\d+)")
    found: list[tuple[int, str, Path]] = []
    try:
        for entry in base.iterdir():
            hit = pattern.fullmatch(entry.name)
            if hit:
                found.append((int(hit.group(1)), entry.name, entry / "device"))
    except OSError:
        # Missing/unreadable directory (no NVMe support, not Linux, ...): report nothing.
        return []
    # Sort on (index, name) only: a plain string sort would put nvme10 ahead of nvme2.
    found.sort(key=lambda item: item[:2])
    return [(name, device) for _, name, device in found]
def _nvme_link(block_name: str) -> str: def _nvme_link(block_name: str) -> str:
"""PCIe link for an NVMe block device (nvme0n1 → controller nvme0); '' for non-NVMe.""" """PCIe link for an NVMe block device (nvme0n1 → controller nvme0); '' for non-NVMe."""
m = re.match(r"(nvme\d+)", block_name) m = re.match(r"(nvme\d+)", block_name)
+40 -1
View File
@@ -1,8 +1,18 @@
"""Tests for the M4 health report's log scanner (synthetic input).""" """Tests for the M4 health report's log scanner (synthetic input)."""
import unittest import unittest
from pathlib import Path
from unittest import mock
from rigdoctor.core.health import CRITICAL, WARNING, run_health_checks, scan_journal_text from rigdoctor.core import health
from rigdoctor.core.health import (
CRITICAL,
INFO,
WARNING,
check_pcie_links,
run_health_checks,
scan_journal_text,
)
class HealthScanTests(unittest.TestCase): class HealthScanTests(unittest.TestCase):
@@ -42,5 +52,34 @@ class HealthScanTests(unittest.TestCase):
self.assertEqual(ranks, sorted(ranks)) self.assertEqual(ranks, sorted(ranks))
class PcieLinkCheckTests(unittest.TestCase):
    """Exercise check_pcie_links() against a single faked NVMe controller."""

    def _with_link(self, cur_g, cur_w, max_g, max_w):
        """Return (controllers_patch, link_patch) faking one controller with the given link."""
        fake_link = (cur_g, cur_w, max_g, max_w)
        controllers_patch = mock.patch(
            "rigdoctor.core.inventory.nvme_controllers",
            return_value=[("nvme0", Path("/x"))],
        )
        link_patch = mock.patch(
            "rigdoctor.core.inventory.read_link",
            return_value=fake_link,
        )
        return controllers_patch, link_patch

    def _findings_for(self, *link):
        """Run the check with both patches active and hand back its findings."""
        controllers_patch, link_patch = self._with_link(*link)
        with controllers_patch:
            with link_patch:
                return check_pcie_links()

    def test_reduced_width_is_a_warning_about_lane_sharing(self):
        # Gen4 x2 on a drive that supports x4.
        findings = self._findings_for(4, "2", 4, "4")
        self.assertEqual(len(findings), 1)
        first = findings[0]
        self.assertEqual(first.severity, WARNING)
        self.assertIn("lane-sharing", first.detail)

    def test_reduced_speed_only_is_info(self):
        # Gen3 x4 on a drive that supports Gen4.
        findings = self._findings_for(3, "4", 4, "4")
        self.assertEqual(len(findings), 1)
        self.assertEqual(findings[0].severity, INFO)

    def test_full_speed_no_finding(self):
        # Link already at full capability: nothing to report.
        findings = self._findings_for(4, "4", 4, "4")
        self.assertEqual(findings, [])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()