"""Health report (M4): scan kernel logs + SMART + driver/library state into a
prioritized, plain-language findings list with suggested fixes (read-only, D9).

Stdlib-only. Every check degrades gracefully — a missing tool/permission yields an
info finding, never an exception.
"""

from __future__ import annotations

import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path

# Severity levels a Finding can carry, listed from most to least urgent.
CRITICAL = "critical"
WARNING = "warning"
INFO = "info"
OK = "ok"
# Sort rank per severity (lower rank is listed first). run_health_checks sorts
# its results by this rank and places any unrecognized severity after all of these.
_ORDER = {CRITICAL: 0, WARNING: 1, INFO: 2, OK: 3}


@dataclass
class Finding:
    """One entry of the health report: a severity, a category and display text.

    Only ``severity``, ``category`` and ``title`` are required; every other
    field defaults to the empty string.
    """
    severity: str   # one of the module constants: critical | warning | info | ok
    category: str   # e.g. GPU, Kernel, Memory, Storage, Thermal, Driver, PCIe, Logs, Hardware
    title: str      # short headline for the finding
    detail: str = ""      # longer plain-language explanation
    suggestion: str = ""  # suggested fix / next step
    action: str = ""  # optional: id of an installable catalog component (for an Install button)
    fix: str = ""     # optional: id of an applyable runtime tunable (for an Apply dropdown, M6)


# --- NVIDIA Xid knowledge (the seed crash is Xid 79) --------------------------
# Xid code -> (severity, plain-language description). Codes missing from this
# table are still reported by scan_journal_text, as a WARNING with a generic
# description.
_XID_INFO: dict[int, tuple[str, str]] = {
    13: (WARNING, "Graphics engine exception (often an app/driver bug or unstable overclock)"),
    31: (WARNING, "GPU memory page fault (usually a driver or application bug)"),
    43: (WARNING, "GPU stopped processing a task (application error)"),
    45: (INFO, "Preemptive channel removal (often a side-effect of another error or a reboot)"),
    48: (CRITICAL, "Double-bit ECC error — VRAM hardware fault"),
    62: (CRITICAL, "Internal microcontroller halt (often follows instability)"),
    79: (CRITICAL, "GPU has fallen off the bus — hardware: power delivery, PCIe link, or thermals"),
    94: (CRITICAL, "Contained ECC error"),
    95: (CRITICAL, "Uncontained ECC error"),
    119: (CRITICAL, "GSP RPC timeout — GPU System Processor hang"),
    120: (CRITICAL, "GSP error — GPU System Processor fault"),
}
# Xid code -> tailored suggestion. Codes without an entry get a generic
# "look it up in NVIDIA's documentation" suggestion in scan_journal_text.
_XID_SUGGEST: dict[int, str] = {
    79: "Check PSU/power cables and reseat the GPU/riser; test a lower power limit "
        "(`sudo nvidia-smi -pl <watts>`) and capture a session with `rigdoctor record`.",
    48: "Persistent VRAM ECC errors mean failing memory — RMA the card if it recurs.",
    119: "GSP hangs are often driver-version specific — try a different driver branch.",
    120: "GSP errors are often driver-version specific — try a different driver branch.",
}
# Matches "Xid", an optional parenthesized group, an optional colon, then the
# numeric code (captured as group 1) — e.g. "Xid (PCI:0000:01:00): 79" or "Xid 79".
_XID_RE = re.compile(r"Xid(?:\s*\([^)]*\))?:?\s*(\d+)")


def scan_journal_text(text: str) -> list[Finding]:
    """Parse kernel-log text into findings (separated from IO so it's testable).

    Args:
        text: Kernel log output, one message per line.

    Returns:
        Findings in a fixed order: one per distinct NVIDIA Xid code (ascending
        code), then at most one each for OOM kills, kernel panic, MCE, PCIe AER,
        thermal events and amdgpu resets. Empty when no signature matched.
    """
    lines = text.splitlines()
    low = [ln.lower() for ln in lines]
    findings: list[Finding] = []

    # Tally Xid codes; only the first code on a line is counted.
    xids: dict[int, int] = {}
    for line in lines:
        if "Xid" not in line:
            continue
        m = _XID_RE.search(line)
        if m:
            code = int(m.group(1))
            xids[code] = xids.get(code, 0) + 1
    for code in sorted(xids):
        severity, desc = _XID_INFO.get(code, (WARNING, f"NVIDIA GPU error (Xid {code})"))
        suggest = _XID_SUGGEST.get(code, "Look up this Xid code in NVIDIA's Xid error documentation.")
        findings.append(Finding(severity, "GPU", f"NVIDIA Xid {code} ×{xids[code]}", desc, suggest))

    # A single OOM kill emits several kernel lines ("... invoked oom-killer",
    # "oom-kill:constraint=...", "Out of memory: Killed process ...",
    # "oom_reaper: reaped process ..."), so counting every marker line inflates
    # the "kills" figure. Count the per-kill "Out of memory:" line instead (this
    # also covers "Memory cgroup out of memory:"), and only fall back to counting
    # raw marker lines when no such line is present (e.g. a truncated capture).
    oom = sum(1 for ln in low if "out of memory:" in ln)
    if not oom:
        oom = sum(1 for ln in lines if "Out of memory" in ln or "oom-kill" in ln or "oom_reaper" in ln)
    if oom:
        findings.append(Finding(
            WARNING, "Memory", f"Out-of-memory kills ×{oom}",
            "The kernel killed processes to reclaim RAM.",
            "Close memory-heavy apps, add zram/swap, or investigate a leak.",
        ))

    if any("Kernel panic" in ln for ln in lines):
        findings.append(Finding(
            CRITICAL, "Kernel", "Kernel panic recorded",
            "The kernel hit an unrecoverable error.",
            "Note the panic message; review recent driver/kernel updates and hardware.",
        ))

    if any("mce:" in ln or "Machine check" in ln or "Hardware Error" in ln for ln in lines):
        findings.append(Finding(
            CRITICAL, "Hardware", "Machine Check Exception (MCE)",
            "The CPU reported a hardware error.",
            "Run memtest86 for RAM, check CPU temps/voltages, and review the MCE detail.",
        ))

    if any("AER:" in ln or "PCIe Bus Error" in ln or ("pcieport" in ln and "error" in ln.lower()) for ln in lines):
        findings.append(Finding(
            WARNING, "PCIe", "PCIe bus errors (AER)",
            "Correctable/uncorrectable PCIe errors were logged.",
            "Reseat the device and check risers/cabling; AER storms can precede a GPU drop.",
        ))

    # The remaining signatures are matched case-insensitively.
    if any(("thermal" in ln and ("critical" in ln or "throttl" in ln)) or "temperature above threshold" in ln for ln in low):
        findings.append(Finding(
            WARNING, "Thermal", "Thermal events logged",
            "The system logged thermal throttling / critical-temperature events.",
            "Improve airflow/cooling and check fan curves; watch live temps on the dashboard.",
        ))

    if any("amdgpu" in ln and "reset" in ln for ln in low):
        findings.append(Finding(
            CRITICAL, "GPU", "AMD GPU reset (amdgpu)",
            "The AMD GPU was reset after a hang.",
            "Check power/thermals/driver; capture a session with `rigdoctor record`.",
        ))

    return findings


def _journalctl(args: list[str]) -> str | None:
    """Run ``journalctl`` with *args* and return its stdout, or None if unusable.

    None is returned when journalctl is not on PATH, cannot be launched, times
    out (25 s), or exits non-zero without printing anything. That lets callers
    tell "couldn't read the journal" apart from "journal read, nothing in it".
    """
    if shutil.which("journalctl") is None:
        return None
    try:
        # errors="replace": log messages can contain bytes that are invalid in the
        # locale encoding. Strict decoding would raise UnicodeDecodeError (a
        # ValueError, not caught below) and break the "never an exception" contract.
        proc = subprocess.run(
            ["journalctl", *args],
            capture_output=True, text=True, errors="replace", timeout=25,
        )
    except (subprocess.SubprocessError, OSError):
        return None
    # A failed run with no output must not look like an empty (clean) journal.
    if proc.returncode != 0 and not proc.stdout.strip():
        return None
    return proc.stdout


def check_journal() -> list[Finding]:
    """Scan the last 7 days of kernel journal messages for fault signatures.

    Returns:
        The findings from ``scan_journal_text``; a single OK finding when the
        scan turned up nothing; or a single INFO finding when the journal
        could not be read at all.
    """
    text = _journalctl(["-k", "--no-pager", "-o", "cat", "--since", "-7 days"])
    if text is not None:
        results = scan_journal_text(text)
        if results:
            return results
        # Journal was readable and clean: say so explicitly.
        return [Finding(
            OK, "Logs", "No notable kernel errors (last 7 days)",
            "No Xid, panic, OOM, MCE, PCIe AER, or thermal events found.",
        )]
    return [Finding(
        INFO, "Logs", "Couldn't read the kernel journal",
        "journalctl is unavailable or not readable.",
        "Ensure systemd/journald is present and your user is in the 'systemd-journal' or 'adm' group.",
    )]


def check_previous_boot() -> list[Finding]:
    """Scan the previous boot's kernel log — the boot that crashed — for fault signatures.

    Needs persistent journald (else the crashed boot's logs were lost on reboot, which the
    persistence check flags separately). Findings are framed as coming from that boot.

    Returns:
        The previous boot's findings with their detail text prefixed to say where
        they came from; every other field (including ``action`` and ``fix``) is
        carried over unchanged. Empty when that boot's log is unavailable or blank.
    """
    out = _journalctl(["-k", "-b", "-1", "--no-pager", "-o", "cat"])
    if not out or not out.strip():
        return []
    prefix = "Logged during the previous (crashed) boot. "
    return [
        Finding(
            f.severity, f.category, f.title,
            (prefix + (f.detail or "")).strip(),
            f.suggestion,
            # Carry these through too, so a finding's Install/Apply hooks
            # aren't silently dropped by the re-tagging.
            f.action, f.fix,
        )
        for f in scan_journal_text(out)
    ]


def check_journal_persistence() -> list[Finding]:
    """Warn when ``/var/log/journal`` is not a directory.

    Returns:
        An empty list when the directory exists, otherwise a single WARNING
        finding explaining how to enable persistent logging.
    """
    journal_dir = Path("/var/log/journal")
    if not journal_dir.is_dir():
        return [Finding(
            WARNING, "Logs", "journald isn't persistent across reboots",
            "Crash-boot kernel logs are discarded on reboot, so a hard freeze's evidence can vanish.",
            "Enable persistent logging: `sudo mkdir -p /var/log/journal && sudo systemctl restart systemd-journald`",
        )]
    return []


def check_nvidia_driver() -> list[Finding]:
    """Report an NVIDIA driver/library version mismatch, if nvidia-smi shows one.

    Returns:
        A single CRITICAL finding when nvidia-smi's combined stdout/stderr
        contains "Driver/library version mismatch"; otherwise an empty list
        (including when nvidia-smi is missing, fails to launch, or times out).
    """
    if shutil.which("nvidia-smi") is None:
        return []
    try:
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, timeout=10)
    except (subprocess.SubprocessError, OSError):
        return []
    output = result.stdout + result.stderr
    if "Driver/library version mismatch" not in output:
        return []
    return [Finding(
        CRITICAL, "Driver", "NVIDIA driver/library version mismatch",
        "The loaded kernel module and the userspace NVIDIA libraries differ — GPU monitoring will fail until resolved.",
        "Reboot to load the matching module (or finish the interrupted driver update).",
    )]


def _smart_devices() -> list[str]:
    """List the device paths printed by ``smartctl --scan``.

    Returns:
        The first whitespace-separated token of every output line that starts
        with ``/dev/`` (after stripping), in output order. Empty when smartctl
        cannot be launched or times out.
    """
    try:
        scan = subprocess.run(["smartctl", "--scan"], capture_output=True, text=True, timeout=10)
    except (subprocess.SubprocessError, OSError):
        return []
    entries = (raw.strip() for raw in scan.stdout.splitlines())
    return [entry.split()[0] for entry in entries if entry.startswith("/dev/")]


def check_smart() -> list[Finding]:
    """Run ``smartctl -H`` on each scanned drive and classify the output.

    Returns:
        A single INFO finding when smartctl is missing or no drives could be
        enumerated. Otherwise one finding per drive whose output was
        recognized: INFO (needs root), OK (contains "PASSED") or CRITICAL
        (contains "FAILED"/"FAILING_NOW"), checked in that order. Drives whose
        smartctl run fails to launch, times out, or prints none of these
        markers produce no finding.
    """
    if shutil.which("smartctl") is None:
        return [Finding(
            INFO, "Storage", "SMART not checked (smartmontools missing)",
            "Disk self-health couldn't be read.",
            "Install it for disk health checks: `sudo apt install smartmontools`",
        )]

    drives = _smart_devices()
    if not drives:
        return [Finding(
            INFO, "Storage", "SMART: couldn't enumerate drives",
            "Reading SMART usually needs root.",
            "Run: `sudo rigdoctor report`",
        )]

    results: list[Finding] = []
    for drive in drives:
        try:
            run = subprocess.run(["smartctl", "-H", drive], capture_output=True, text=True, timeout=15)
        except (subprocess.SubprocessError, OSError):
            continue
        output = run.stdout + run.stderr

        needs_root = "Permission denied" in output or "requires root" in output.lower()
        if needs_root:
            results.append(Finding(INFO, "Storage", f"SMART for {drive} needs root", "", "Run: `sudo rigdoctor report`"))
            continue
        # "PASSED" is deliberately tested before the failure markers.
        if "PASSED" in output:
            results.append(Finding(OK, "Storage", f"SMART OK: {drive}", "Overall-health self-assessment passed."))
            continue
        if "FAILED" in output or "FAILING_NOW" in output:
            results.append(Finding(CRITICAL, "Storage", f"SMART FAILED: {drive}", "The drive reports failing health.", "Back up now and replace the drive."))
    return results


def check_live_temps() -> list[Finding]:
    """Take one live sample and flag any °C reading at or above 90.

    Returns:
        An empty list when no reading qualifies; otherwise a single WARNING
        finding whose title carries the highest value and whose detail lists
        every qualifying reading.
    """
    from .sampler import Sampler
    from .sources import available_sources

    snapshot = Sampler(available_sources()).sample()

    hot = []
    for reading in snapshot.readings:
        # Readings with no value are skipped before the threshold comparison.
        if reading.unit == "°C" and reading.value is not None and reading.value >= 90:
            hot.append((reading.source, reading.label or reading.metric, reading.value))
    if not hot:
        return []

    hottest = max(hot, key=lambda entry: entry[2])
    parts = [f"{s} {label} {v:.0f}°C" for s, label, v in hot]
    return [Finding(
        WARNING, "Thermal", f"High temperature right now ({hottest[2]:.0f}°C)",
        "; ".join(parts), "Check cooling/airflow and reduce load.",
    )]


def run_health_checks(include_journal: bool = True) -> list[Finding]:
    """Run every check and return the findings ordered worst-severity first.

    SMART needs root, so when ``elevation.privileged()`` returns a mapping with
    a non-None ``"smart"`` entry, those dicts are turned into findings instead
    of re-running smartctl (which would just report "needs root").

    Args:
        include_journal: When False, the 7-day kernel-journal scan is skipped —
            used by the crash analysis, which scans the previous (crashed)
            boot specifically instead.

    Returns:
        All findings sorted by severity rank; findings of equal severity keep
        the order in which the checks produced them.
    """
    from . import elevation

    results: list[Finding] = list(check_nvidia_driver())
    if include_journal:
        results.extend(check_journal())
    results.extend(check_journal_persistence())

    privileged = elevation.privileged()
    smart_rows = privileged.get("smart") if privileged is not None else None
    if smart_rows is not None:
        results.extend(Finding(**row) for row in smart_rows)
    else:
        results.extend(check_smart())

    results.extend(check_live_temps())
    # sorted() is stable, so ties keep their insertion order; unknown severities rank last.
    return sorted(results, key=lambda item: _ORDER.get(item.severity, 9))