#!/usr/bin/env bash
# ============================================================
# PVE NVIDIA RTX 3060 Driver Setup Script
# Tested on: Proxmox VE 9.1.6 | Kernel 6.14.11-5-pve
# NVIDIA Driver: 580.76.05 | CUDA: 13.0
# NOTE(review): the step-5 notes below cite nvidia-kernel-580.65.06 while
#   this header says 580.76.05 — confirm which version the repo ships.
#
# Provenance
#   This is the script that was executed on the PVE host (captured from the
#   CT 108 "openclaw" console, Proxmox UI at https://192.168.10.2:8006/).
#   It can be saved and re-run on any fresh PVE node with RTX 3060s and the
#   CUDA 13.0 local repo already present.
#
# How to use it on a fresh PVE node
#   1. Save it:            nano /root/pve-nvidia-setup.sh
#   2. Make executable:    chmod +x /root/pve-nvidia-setup.sh
#   3. Run it (as root):   /root/pve-nvidia-setup.sh
#   4. Check the log:      cat /root/pve-nvidia-install.log
#
# Prerequisites before running
#   The CUDA 13.0 local repo must already be present at
#   /var/cuda-repo-debian12-13-0-local/ (as it was on the original node).
#   On a completely fresh PVE node, first download and install the CUDA
#   local repo .deb from NVIDIA, then run this script.
#
# What the script does NOT do (by design)
#   - Does not touch any Proxmox packages or kernel
#   - Does not modify any LXC configs
#   - Does not install anything inside containers
#   - Does not reboot the host. No reboot was needed on the original node
#     because the modules were already loaded there; on other nodes the
#     verification step may report that a reboot is needed, and the nouveau
#     blacklist only takes effect on the next boot.
# ============================================================
set -euo pipefail

readonly LOG=/root/pve-nvidia-install.log

# Print an error to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Rename every /lib/modules/6.17.*/build symlink to build.disabled so DKMS
# skips those kernels (NVIDIA 580.x is incompatible with the 6.17 API).
hide_incompatible_builds() {
  local build
  for build in /lib/modules/6.17.*/build; do
    # An unmatched glob stays literal and fails the -L test, so it is skipped.
    [[ -L "$build" ]] || continue
    # -T: never descend into an existing build.disabled symlink-to-directory.
    mv -T -- "$build" "${build}.disabled"
    echo "Temporarily hidden: $build"
  done
}

# Undo hide_incompatible_builds. Stateless and idempotent: it restores any
# build.disabled symlink it finds, including ones left behind by an earlier
# run. Tests with -L because the entry is a symlink to a directory, which a
# -f (regular file) test would never match.
restore_hidden_builds() {
  local disabled build
  for disabled in /lib/modules/6.17.*/build.disabled; do
    [[ -L "$disabled" ]] || continue
    build=${disabled%.disabled}
    if [[ -e "$build" || -L "$build" ]]; then
      echo "WARNING: $build already exists; leaving $disabled in place" >&2
      continue
    fi
    if mv -T -- "$disabled" "$build"; then
      echo "Restored: $build"
    else
      echo "WARNING: could not restore $build from $disabled" >&2
    fi
  done
}

# STEP 1 — Environment audit (read-only).
# pveversion failing (e.g. not a PVE host) aborts the script before any
# system file is written.
audit_environment() {
  echo ""
  echo "--- Environment Info ---"
  pveversion -v
  uname -r
  lspci | grep -i nvidia || echo "WARNING: No NVIDIA GPU detected"
}

# STEP 2 — Blacklist nouveau and rebuild the initramfs so the blacklist is
# baked in on next boot.
blacklist_nouveau() {
  echo ""
  echo "--- Blacklisting nouveau ---"
  cat > /etc/modprobe.d/blacklist-nouveau.conf << 'EOF'
blacklist nouveau
options nouveau modeset=0
EOF
  echo "Written: /etc/modprobe.d/blacklist-nouveau.conf"

  update-initramfs -u -k all
}

# STEP 3 — NVIDIA modprobe options (headless server, minimal).
write_nvidia_modprobe_conf() {
  echo ""
  echo "--- Writing /etc/modprobe.d/nvidia.conf ---"
  cat > /etc/modprobe.d/nvidia.conf << 'EOF'
# Headless PVE host - minimal NVIDIA options
options nvidia-drm modeset=1
options nvidia NVreg_UsePageAttributeTable=1
EOF
}

# STEP 4 — Add NVIDIA modules to /etc/modules (load on boot).
# Each module is checked as an exact line, so a comment or a partial entry
# mentioning "nvidia" no longer causes all three to be skipped.
register_boot_modules() {
  local mod
  echo ""
  echo "--- Adding nvidia modules to /etc/modules ---"
  for mod in nvidia nvidia_uvm nvidia_drm; do
    if grep -qxF -- "$mod" /etc/modules 2>/dev/null; then
      echo "$mod already in /etc/modules — skipped"
    else
      printf '%s\n' "$mod" >> /etc/modules
      echo "Added $mod to /etc/modules"
    fi
  done
}

# STEP 5 — Install open kernel DKMS module.
# (nvidia-kernel-open-dkms provides nvidia-kernel-580.65.06, which satisfies
# the nvidia-driver dependency.)
# The 6.17 kernel build dirs are hidden for the whole install + configure
# phase so DKMS skips them, then restored. The EXIT trap installed in main
# restores them as well if apt-get aborts the script.
install_kernel_module() {
  echo ""
  echo "--- Installing nvidia-kernel-open-dkms ---"

  hide_incompatible_builds

  # apt-get rather than apt: apt's command-line interface is not meant for
  # scripts.
  DEBIAN_FRONTEND=noninteractive apt-get install -y nvidia-kernel-open-dkms

  # Best-effort: finish any half-configured packages while 6.17 is hidden.
  dpkg --configure -a 2>&1 \
    || echo "WARNING: dpkg --configure -a reported errors"

  restore_hidden_builds
}

# STEP 6 — Install full NVIDIA driver stack
# (brings nvidia-smi, libcuda1, nvidia-persistenced, nvidia-driver-cuda,
# and all userspace libs).
install_driver_stack() {
  echo ""
  echo "--- Installing nvidia-driver + nvidia-driver-cuda ---"
  DEBIAN_FRONTEND=noninteractive apt-get install -y \
    nvidia-driver \
    nvidia-driver-cuda \
    nvidia-modprobe \
    nvidia-smi \
    libnvidia-sandboxutils
}

# STEP 7 — DKMS exclusion for kernel 6.17.x, intended to prevent future
# build failures when apt upgrades.
# NOTE(review): confirm against dkms(8) that EXCLUDED_KERNEL_VERSIONS is a
#   directive DKMS honours (the documented ones are the BUILD_EXCLUSIVE_*
#   family) and that this file name matches the module name printed by
#   `dkms status`; otherwise this file has no effect.
write_dkms_exclusion() {
  echo ""
  echo "--- Writing DKMS kernel exclusion for 6.17.x ---"
  mkdir -p /etc/dkms
  cat > /etc/dkms/nvidia-kernel-open.conf << 'EOF'
# NVIDIA 580.x is incompatible with kernel 6.17 API changes.
# Remove this file once a newer NVIDIA driver supporting 6.17 is available.
EXCLUDED_KERNEL_VERSIONS="6\.17\..*"
EOF
  echo "Written: /etc/dkms/nvidia-kernel-open.conf"
}

# STEP 8 — Create /dev/nvidia* device nodes now via nvidia-modprobe.
# Failure is tolerated (verification below reports the outcome) but is no
# longer silent.
create_device_nodes() {
  echo ""
  echo "--- Creating /dev/nvidia* device nodes ---"
  nvidia-modprobe -u -c=0 || echo "WARNING: nvidia-modprobe failed"
}

# STEP 9 — Verify: loaded modules, device nodes, DKMS status, nvidia-smi.
verify_install() {
  local node
  local -a dev_nodes=()

  echo ""
  echo "--- Verification ---"
  echo "=== lsmod | grep nvidia ==="
  lsmod | grep -E '^nvidia' \
    || echo "WARNING: No nvidia modules loaded (may need reboot)"

  echo ""
  echo "=== /dev/nvidia* devices ==="
  # Collect only paths that exist, so a missing /dev/nvidia-caps does not
  # trigger the warning when GPU nodes are present.
  for node in /dev/nvidia* /dev/nvidia-caps/*; do
    if [[ -e "$node" ]]; then
      dev_nodes+=("$node")
    fi
  done
  if (( ${#dev_nodes[@]} > 0 )); then
    ls -la -- "${dev_nodes[@]}"
  else
    echo "WARNING: No /dev/nvidia* found"
  fi

  echo ""
  echo "=== DKMS status ==="
  dkms status

  echo ""
  echo "=== nvidia-smi ==="
  nvidia-smi || echo "WARNING: nvidia-smi failed"
}

# List the files this script writes.
print_summary() {
  echo ""
  echo "=== Files modified ==="
  echo "  /etc/modprobe.d/blacklist-nouveau.conf"
  echo "  /etc/modprobe.d/nvidia.conf"
  echo "  /etc/modules  (appended: nvidia, nvidia_uvm, nvidia_drm)"
  echo "  /etc/dkms/nvidia-kernel-open.conf"
  echo "  /boot/initrd.img-*  (regenerated by update-initramfs)"

  echo ""
  echo "=== PVE NVIDIA Setup Complete: $(date) ==="
  echo "Log saved to: $LOG"
}

main() {
  (( EUID == 0 )) || die "this script must be run as root"

  # Mirror all further stdout/stderr into the log file.
  exec > >(tee -a "$LOG") 2>&1
  echo "=== PVE NVIDIA Setup Started: $(date) ==="

  # Safety net: never leave a kernel build symlink hidden, whatever the
  # exit path.
  trap restore_hidden_builds EXIT

  audit_environment
  blacklist_nouveau
  write_nvidia_modprobe_conf
  register_boot_modules
  install_kernel_module
  install_driver_stack
  write_dkms_exclusion
  create_device_nodes
  verify_install
  print_summary
}

main "$@"