Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b20e8dfc3a | |||
| 9fe9a6576f | |||
|
07bc722209
|
|||
|
81c7757546
|
@@ -12,6 +12,11 @@ release tag (so the auto-updater, D18, can compare versions).
|
|||||||
(`current/max_link_speed` + width). If a drive negotiates below its capability (a slower M.2
|
(`current/max_link_speed` + width). If a drive negotiates below its capability (a slower M.2
|
||||||
slot, lane-sharing, or a downtrain) it's flagged: `PCIe Gen3 x4 (capable of Gen4 x4)`. So you
|
slot, lane-sharing, or a downtrain) it's flagged: `PCIe Gen3 x4 (capable of Gen4 x4)`. So you
|
||||||
can confirm a Gen4 SSD is actually in a Gen4 slot. (SATA disks show no PCIe link.)
|
can confirm a Gen4 SSD is actually in a Gen4 slot. (SATA disks show no PCIe link.)
|
||||||
|
- **System Health flags downtrained NVMe links.** A new check warns when an NVMe drive negotiates
|
||||||
|
fewer PCIe lanes than it supports (almost always motherboard **lane-sharing** — a GPU/second
|
||||||
|
card or another M.2 stealing lanes) and notes speed-only reductions as info (a slower slot or
|
||||||
|
idle ASPM). The GPU is deliberately excluded — NVIDIA drops its PCIe gen/width at idle, so a
|
||||||
|
snapshot would false-alarm.
|
||||||
|
|
||||||
## [0.37.1] - 2026-05-22
|
## [0.37.1] - 2026-05-22
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|||||||
@@ -51,30 +51,20 @@ apt pulls the GUI dependencies (PySide6, pyte) automatically:
|
|||||||
sudo apt install ./rigdoctor_*_all.deb # CLI only: add --no-install-recommends
|
sudo apt install ./rigdoctor_*_all.deb # CLI only: add --no-install-recommends
|
||||||
```
|
```
|
||||||
|
|
||||||
**Or add the apt repository** for `apt install` + automatic updates. The registry is public and
|
**Or add the apt repository** for `apt install` + automatic updates (the registry is public and
|
||||||
GPG-signed — no token needed; just add the signing key and a deb822 source:
|
GPG-signed — no token needed):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# signing key → dearmored into the keyring
|
sudo curl https://git.jesseyvanofferen.com/api/packages/jessey/debian/repository.key -o /etc/apt/keyrings/gitea-jessey.asc
|
||||||
sudo install -d -m 0755 /etc/apt/keyrings
|
echo "deb [arch=all signed-by=/etc/apt/keyrings/gitea-jessey.asc] https://git.jesseyvanofferen.com/api/packages/jessey/debian stable main" | sudo tee /etc/apt/sources.list.d/gitea.list
|
||||||
curl -fsSL https://git.jesseyvanofferen.com/api/packages/jessey/debian/repository.key \
|
sudo apt update
|
||||||
| sudo gpg --dearmor -o /etc/apt/keyrings/gitea-jessey.gpg
|
sudo apt install rigdoctor
|
||||||
|
|
||||||
# the source (modern deb822 format, GPG-verified, all-arch)
|
|
||||||
sudo tee /etc/apt/sources.list.d/rigdoctor.sources >/dev/null <<'EOF'
|
|
||||||
Types: deb
|
|
||||||
URIs: https://git.jesseyvanofferen.com/api/packages/jessey/debian
|
|
||||||
Suites: stable
|
|
||||||
Components: main
|
|
||||||
Architectures: all
|
|
||||||
Signed-By: /etc/apt/keyrings/gitea-jessey.gpg
|
|
||||||
EOF
|
|
||||||
|
|
||||||
sudo apt update && sudo apt install rigdoctor
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Then `sudo apt upgrade` keeps it current.
|
Then `sudo apt upgrade` keeps it current.
|
||||||
|
|
||||||
|
Then `sudo apt upgrade` keeps it current.
|
||||||
|
|
||||||
### Any distro — self-extracting `.run` (no root)
|
### Any distro — self-extracting `.run` (no root)
|
||||||
|
|
||||||
Download **`rigdoctor-<version>-installer.run`** from the releases page and run it. It installs
|
Download **`rigdoctor-<version>-installer.run`** from the releases page and run it. It installs
|
||||||
|
|||||||
@@ -251,6 +251,38 @@ def check_live_temps() -> list[Finding]:
|
|||||||
)]
|
)]
|
||||||
|
|
||||||
|
|
||||||
|
def check_pcie_links() -> list[Finding]:
    """Flag NVMe drives whose negotiated PCIe link is below what the drive supports.

    Every controller from ``inventory.nvme_controllers()`` has its link tuple read with
    ``inventory.read_link()`` and is classified as follows:

    * fewer lanes than supported -> WARNING. Width reductions are reliable and are almost
      always motherboard lane-sharing (a GPU/second card or another populated M.2 taking
      lanes from the slot).
    * full width but a lower generation -> INFO. This can be a slower slot, or normal link
      power management at idle.

    Controllers whose current or maximum generation is unreadable are skipped. The GPU is
    intentionally not checked here: NVIDIA drops its PCIe gen *and* width at idle, so a
    point-in-time snapshot is misleading.

    Returns:
        One finding per controller linked below its capability, in the order the
        controllers were listed; an empty list when nothing is reduced.
    """
    from . import inventory

    def lane_count(raw):
        # Widths arrive as raw sysfs strings (e.g. '4'); anything that is not a number is
        # treated as unknown (None) instead of being compared as text.
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    findings: list[Finding] = []
    for name, dev in inventory.nvme_controllers():
        cur_g, cur_w, max_g, max_w = inventory.read_link(dev)
        if not cur_g or not max_g:
            continue

        cur_lanes, max_lanes = lane_count(cur_w), lane_count(max_w)
        # Compare lane counts numerically: only a genuine reduction is a lane-sharing
        # warning. A zero or unreadable width falls through to the generation check.
        if cur_lanes and max_lanes and cur_lanes < max_lanes:
            findings.append(Finding(
                WARNING, "PCIe", f"{name} linked at x{cur_w} (supports x{max_w})",
                f"{name} negotiated PCIe Gen{cur_g} x{cur_w}, but the drive supports "
                f"Gen{max_g} x{max_w}. Fewer lanes is usually motherboard lane-sharing — a GPU or a "
                "second card in a PCIe slot, or another populated M.2, can steal lanes from this slot.",
                "Check your board manual's lane-sharing table; move the drive to a full-x4 "
                "(often CPU-attached) M.2 slot."))
        elif cur_g < max_g:  # width not reduced, but a lower generation → slower slot or idle ASPM
            findings.append(Finding(
                INFO, "PCIe", f"{name} linked at Gen{cur_g} (supports Gen{max_g})",
                f"{name} negotiated PCIe Gen{cur_g} but supports Gen{max_g}. This can be a slower "
                "(chipset or older) M.2 slot, or normal link power management (ASPM) at idle.",
                "If you expect full speed, check the slot and the BIOS PCIe/ASPM settings."))
    return findings
|
||||||
|
|
||||||
|
|
||||||
def run_health_checks(include_journal: bool = True) -> list[Finding]:
|
def run_health_checks(include_journal: bool = True) -> list[Finding]:
|
||||||
"""Run all checks and return findings sorted by severity (worst first).
|
"""Run all checks and return findings sorted by severity (worst first).
|
||||||
|
|
||||||
@@ -273,5 +305,6 @@ def run_health_checks(include_journal: bool = True) -> list[Finding]:
|
|||||||
else:
|
else:
|
||||||
findings += check_smart()
|
findings += check_smart()
|
||||||
findings += check_live_temps()
|
findings += check_live_temps()
|
||||||
|
findings += check_pcie_links()
|
||||||
findings.sort(key=lambda f: _ORDER.get(f.severity, 9))
|
findings.sort(key=lambda f: _ORDER.get(f.severity, 9))
|
||||||
return findings
|
return findings
|
||||||
|
|||||||
@@ -134,11 +134,10 @@ def _gen(speed: str) -> int | None:
|
|||||||
return _PCIE_GEN.get(tok)
|
return _PCIE_GEN.get(tok)
|
||||||
|
|
||||||
|
|
||||||
def _link_desc(dev: Path) -> str:
|
def read_link(dev: Path) -> tuple[int | None, str, int | None, str]:
|
||||||
"""Describe a PCI device's negotiated PCIe link, noting if it's below its max.
|
"""Negotiated/max PCIe link for a PCI device dir: (cur_gen, cur_width, max_gen, max_width).
|
||||||
|
|
||||||
e.g. 'PCIe Gen4 x4', or 'PCIe Gen3 x4 (capable of Gen4 x4)' when downtrained / in a
|
Widths are the raw sysfs strings (e.g. '4'); gens are ints (4) or None when unreadable.
|
||||||
slower slot.
|
|
||||||
"""
|
"""
|
||||||
def rd(name: str) -> str:
|
def rd(name: str) -> str:
|
||||||
try:
|
try:
|
||||||
@@ -146,8 +145,17 @@ def _link_desc(dev: Path) -> str:
|
|||||||
except OSError:
|
except OSError:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
cur_g, cur_w = _gen(rd("current_link_speed")), rd("current_link_width")
|
return (_gen(rd("current_link_speed")), rd("current_link_width"),
|
||||||
max_g, max_w = _gen(rd("max_link_speed")), rd("max_link_width")
|
_gen(rd("max_link_speed")), rd("max_link_width"))
|
||||||
|
|
||||||
|
|
||||||
|
def _link_desc(dev: Path) -> str:
|
||||||
|
"""Describe a PCI device's negotiated PCIe link, noting if it's below its max.
|
||||||
|
|
||||||
|
e.g. 'PCIe Gen4 x4', or 'PCIe Gen3 x4 (capable of Gen4 x4)' when downtrained / in a
|
||||||
|
slower slot.
|
||||||
|
"""
|
||||||
|
cur_g, cur_w, max_g, max_w = read_link(dev)
|
||||||
if not cur_g or not cur_w:
|
if not cur_g or not cur_w:
|
||||||
return ""
|
return ""
|
||||||
desc = f"PCIe Gen{cur_g} x{cur_w}"
|
desc = f"PCIe Gen{cur_g} x{cur_w}"
|
||||||
@@ -156,6 +164,16 @@ def _link_desc(dev: Path) -> str:
|
|||||||
return desc
|
return desc
|
||||||
|
|
||||||
|
|
||||||
|
def nvme_controllers(base: Path = Path("/sys/class/nvme")) -> list[tuple[str, Path]]:
    """List NVMe controllers as (name, pci-device-dir), e.g. ('nvme0', /sys/.../device).

    Args:
        base: Directory holding one entry per controller. Defaults to the sysfs NVMe class
            directory; overridable so the listing can be pointed at another tree.

    Returns:
        One ``(name, base / name / "device")`` pair per entry whose name is exactly
        ``nvme<digits>`` (so names such as ``nvme0n1`` are ignored), ordered by controller
        index. An empty list when ``base`` cannot be listed.
    """
    try:
        names = [p.name for p in base.iterdir() if re.fullmatch(r"nvme\d+", p.name)]
    except OSError:
        return []
    # Sort by the numeric index so nvme2 comes before nvme10 (plain string order would
    # put nvme10 first).
    names.sort(key=lambda n: int(n[len("nvme"):]))
    return [(n, base / n / "device") for n in names]
|
||||||
|
|
||||||
|
|
||||||
def _nvme_link(block_name: str) -> str:
|
def _nvme_link(block_name: str) -> str:
|
||||||
"""PCIe link for an NVMe block device (nvme0n1 → controller nvme0); '' for non-NVMe."""
|
"""PCIe link for an NVMe block device (nvme0n1 → controller nvme0); '' for non-NVMe."""
|
||||||
m = re.match(r"(nvme\d+)", block_name)
|
m = re.match(r"(nvme\d+)", block_name)
|
||||||
|
|||||||
+40
-1
@@ -1,8 +1,18 @@
|
|||||||
"""Tests for the M4 health report's log scanner (synthetic input)."""
|
"""Tests for the M4 health report's log scanner (synthetic input)."""
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
from rigdoctor.core.health import CRITICAL, WARNING, run_health_checks, scan_journal_text
|
from rigdoctor.core import health
|
||||||
|
from rigdoctor.core.health import (
|
||||||
|
CRITICAL,
|
||||||
|
INFO,
|
||||||
|
WARNING,
|
||||||
|
check_pcie_links,
|
||||||
|
run_health_checks,
|
||||||
|
scan_journal_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class HealthScanTests(unittest.TestCase):
|
class HealthScanTests(unittest.TestCase):
|
||||||
@@ -42,5 +52,34 @@ class HealthScanTests(unittest.TestCase):
|
|||||||
self.assertEqual(ranks, sorted(ranks))
|
self.assertEqual(ranks, sorted(ranks))
|
||||||
|
|
||||||
|
|
||||||
|
class PcieLinkCheckTests(unittest.TestCase):
    """check_pcie_links() driven by one fake NVMe controller with a canned link tuple."""

    def _with_link(self, cur_g, cur_w, max_g, max_w):
        """Build the two patchers: a single 'nvme0' controller and its link reading."""
        fake_controllers = mock.patch(
            "rigdoctor.core.inventory.nvme_controllers",
            return_value=[("nvme0", Path("/x"))],
        )
        fake_link = mock.patch(
            "rigdoctor.core.inventory.read_link",
            return_value=(cur_g, cur_w, max_g, max_w),
        )
        return (fake_controllers, fake_link)

    def test_reduced_width_is_a_warning_about_lane_sharing(self):
        # Gen4 x2 on a drive that supports x4
        patch_ctrls, patch_link = self._with_link(4, "2", 4, "4")
        with patch_ctrls:
            with patch_link:
                results = check_pcie_links()
        self.assertEqual(len(results), 1)
        only = results[0]
        self.assertEqual(only.severity, WARNING)
        self.assertIn("lane-sharing", only.detail)

    def test_reduced_speed_only_is_info(self):
        # Gen3 x4 on a drive that supports Gen4
        patch_ctrls, patch_link = self._with_link(3, "4", 4, "4")
        with patch_ctrls:
            with patch_link:
                results = check_pcie_links()
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].severity, INFO)

    def test_full_speed_no_finding(self):
        # Link already at the drive's full capability
        patch_ctrls, patch_link = self._with_link(4, "4", 4, "4")
        with patch_ctrls:
            with patch_link:
                self.assertEqual(check_pcie_links(), [])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user