diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d84e0d..a264089 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ release tag (so the auto-updater, D18, can compare versions). (`current/max_link_speed` + width). If a drive negotiates below its capability (a slower M.2 slot, lane-sharing, or a downtrain) it's flagged: `PCIe Gen3 x4 (capable of Gen4 x4)`. So you can confirm a Gen4 SSD is actually in a Gen4 slot. (SATA disks show no PCIe link.) +- **System Health flags downtrained NVMe links.** A new check warns when an NVMe drive negotiates + fewer PCIe lanes than it supports (almost always motherboard **lane-sharing** — a GPU/second + card or another M.2 stealing lanes) and notes speed-only reductions as info (a slower slot or + idle ASPM). The GPU is deliberately excluded — NVIDIA drops its PCIe gen/width at idle, so a + snapshot would false-alarm. ## [0.37.1] - 2026-05-22 ### Fixed diff --git a/src/rigdoctor/core/health.py b/src/rigdoctor/core/health.py index 3ce1907..c54a47b 100644 --- a/src/rigdoctor/core/health.py +++ b/src/rigdoctor/core/health.py @@ -251,6 +251,38 @@ def check_live_temps() -> list[Finding]: )] +def check_pcie_links() -> list[Finding]: + """Flag NVMe drives linked below their PCIe capability — a slower slot or, most often, + motherboard lane-sharing where a GPU/second card or another M.2 steals lanes from the slot. + + Width reductions are reliable (reported as warnings); speed-only reductions are info (they can + also be normal link power management at idle). The GPU is intentionally not checked here: + NVIDIA drops its PCIe gen *and* width at idle, so a point-in-time snapshot is misleading. + """ + from . 
import inventory + + findings: list[Finding] = [] + for name, dev in inventory.nvme_controllers(): + cur_g, cur_w, max_g, max_w = inventory.read_link(dev) + if not cur_g or not max_g: + continue + if max_w and cur_w and cur_w != max_w: # fewer lanes → almost always lane-sharing + findings.append(Finding( + WARNING, "PCIe", f"{name} linked at x{cur_w} (supports x{max_w})", + f"{name} negotiated PCIe Gen{cur_g} x{cur_w}, but the drive supports " + f"Gen{max_g} x{max_w}. Fewer lanes is usually motherboard lane-sharing — a GPU or a " + "second card in a PCIe slot, or another populated M.2, can steal lanes from this slot.", + "Check your board manual's lane-sharing table; move the drive to a full-x4 " + "(often CPU-attached) M.2 slot.")) + elif cur_g < max_g: # full width but a lower generation → slower slot or idle ASPM + findings.append(Finding( + INFO, "PCIe", f"{name} linked at Gen{cur_g} (supports Gen{max_g})", + f"{name} negotiated PCIe Gen{cur_g} but supports Gen{max_g}. This can be a slower " + "(chipset or older) M.2 slot, or normal link power management (ASPM) at idle.", + "If you expect full speed, check the slot and the BIOS PCIe/ASPM settings.")) + return findings + + def run_health_checks(include_journal: bool = True) -> list[Finding]: """Run all checks and return findings sorted by severity (worst first). @@ -273,5 +305,6 @@ def run_health_checks(include_journal: bool = True) -> list[Finding]: else: findings += check_smart() findings += check_live_temps() + findings += check_pcie_links() findings.sort(key=lambda f: _ORDER.get(f.severity, 9)) return findings diff --git a/src/rigdoctor/core/inventory.py b/src/rigdoctor/core/inventory.py index f0e43c5..330fca4 100644 --- a/src/rigdoctor/core/inventory.py +++ b/src/rigdoctor/core/inventory.py @@ -134,11 +134,10 @@ def _gen(speed: str) -> int | None: return _PCIE_GEN.get(tok) -def _link_desc(dev: Path) -> str: - """Describe a PCI device's negotiated PCIe link, noting if it's below its max. 
+def read_link(dev: Path) -> tuple[int | None, str, int | None, str]: + """Negotiated/max PCIe link for a PCI device dir: (cur_gen, cur_width, max_gen, max_width). - e.g. 'PCIe Gen4 x4', or 'PCIe Gen3 x4 (capable of Gen4 x4)' when downtrained / in a - slower slot. + Widths: raw sysfs strings ('4', or '' if unreadable); gens: ints (4), or None if unreadable. """ def rd(name: str) -> str: try: @@ -146,8 +145,17 @@ def _link_desc(dev: Path) -> str: except OSError: return "" - cur_g, cur_w = _gen(rd("current_link_speed")), rd("current_link_width") - max_g, max_w = _gen(rd("max_link_speed")), rd("max_link_width") + return (_gen(rd("current_link_speed")), rd("current_link_width"), + _gen(rd("max_link_speed")), rd("max_link_width")) + + +def _link_desc(dev: Path) -> str: + """Describe a PCI device's negotiated PCIe link, noting if it's below its max. + + e.g. 'PCIe Gen4 x4', or 'PCIe Gen3 x4 (capable of Gen4 x4)' when downtrained / in a + slower slot. + """ + cur_g, cur_w, max_g, max_w = read_link(dev) if not cur_g or not cur_w: return "" desc = f"PCIe Gen{cur_g} x{cur_w}" @@ -156,6 +164,16 @@ def _link_desc(dev: Path) -> str: return desc +def nvme_controllers() -> list[tuple[str, Path]]: + """Each NVMe controller as (name, pci-device-dir), e.g. 
('nvme0', /sys/.../device).""" + base = Path("/sys/class/nvme") + try: + entries = [p for p in base.iterdir() if re.fullmatch(r"nvme\d+", p.name)] + except OSError: + return [] + return sorted((p.name, p / "device") for p in entries) + + def _nvme_link(block_name: str) -> str: """PCIe link for an NVMe block device (nvme0n1 → controller nvme0); '' for non-NVMe.""" m = re.match(r"(nvme\d+)", block_name) diff --git a/tests/test_health.py b/tests/test_health.py index edf6e76..de6bf6e 100644 --- a/tests/test_health.py +++ b/tests/test_health.py @@ -1,8 +1,18 @@ """Tests for the M4 health report's log scanner (synthetic input).""" import unittest +from pathlib import Path +from unittest import mock -from rigdoctor.core.health import CRITICAL, WARNING, run_health_checks, scan_journal_text +from rigdoctor.core import health +from rigdoctor.core.health import ( + CRITICAL, + INFO, + WARNING, + check_pcie_links, + run_health_checks, + scan_journal_text, +) class HealthScanTests(unittest.TestCase): @@ -42,5 +52,34 @@ class HealthScanTests(unittest.TestCase): self.assertEqual(ranks, sorted(ranks)) +class PcieLinkCheckTests(unittest.TestCase): + def _with_link(self, cur_g, cur_w, max_g, max_w): + # one fake NVMe controller returning the given link tuple + return (mock.patch("rigdoctor.core.inventory.nvme_controllers", + return_value=[("nvme0", Path("/x"))]), + mock.patch("rigdoctor.core.inventory.read_link", + return_value=(cur_g, cur_w, max_g, max_w))) + + def test_reduced_width_is_a_warning_about_lane_sharing(self): + ctrls, link = self._with_link(4, "2", 4, "4") # Gen4 x2 but supports x4 + with ctrls, link: + findings = check_pcie_links() + self.assertEqual(len(findings), 1) + self.assertEqual(findings[0].severity, WARNING) + self.assertIn("lane-sharing", findings[0].detail) + + def test_reduced_speed_only_is_info(self): + ctrls, link = self._with_link(3, "4", 4, "4") # Gen3 x4 but supports Gen4 + with ctrls, link: + findings = check_pcie_links() + 
self.assertEqual(len(findings), 1) + self.assertEqual(findings[0].severity, INFO) + + def test_full_speed_no_finding(self): + ctrls, link = self._with_link(4, "4", 4, "4") + with ctrls, link: + self.assertEqual(check_pcie_links(), []) + + if __name__ == "__main__": unittest.main()