Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions crates/openshell-bootstrap/src/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,34 @@ pub async fn ensure_container(
}
}

// On Tegra platforms (Jetson) the NVIDIA container toolkit and CDI spec
// generation reads host-file injection config from
// /etc/nvidia-container-runtime/host-files-for-container.d on the host.
// Without this bind mount, the device plugin inside k3s cannot discover
// Tegra GPU devices and fails with "CDI options are only supported on
// NVML-based systems".
//
// We detect Tegra by querying the Docker daemon's kernel version (which
// works for both local and remote/SSH deploys) rather than checking the
// local filesystem.
if !device_ids.is_empty() {
let info = docker.info().await.into_diagnostic()?;
let is_tegra = info
.kernel_version
.as_deref()
.map_or(false, |k| k.contains("tegra"));
if is_tegra {
const HOST_FILES_DIR: &str = "/etc/nvidia-container-runtime/host-files-for-container.d";
tracing::info!(
kernel_version = info.kernel_version.as_deref().unwrap_or("unknown"),
"Detected Tegra platform, bind-mounting {HOST_FILES_DIR} for CDI spec generation"
);
Comment on lines +616 to +620
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@elezar some context on this change: the previous version checking if that file exists wouldn't work for remote deployments, as the file would be checked against local file system and not the remote one.

By default, the gateway starts on the local system, but it can be deployed remotely with a command like this.

openshell gateways start --gpu --remote pmlocek@remote-host

In this case, the CLI is installed on my Mac and the gateway is deployed over SSH on the remote-host.

This approach will work for both local and remote deployments, and is based on the Docker daemon's reported kernel version string containing "tegra", as at this point we don't have access to the filesystem on the remote host. We could add remote filesystem inspection, but it would be a bigger change.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also was wondering if it would make sense to include that path /etc/nvidia-container-runtime/host-files-for-container.d in the CDI that is generated on that host? Could the container toolkit running on Tegra (or other systems that need these CSVs) add that mount to the CDI spec? Then, this manual mount wouldn't be necessary.

let mut binds = host_config.binds.take().unwrap_or_default();
binds.push(format!("{HOST_FILES_DIR}:{HOST_FILES_DIR}:ro"));
host_config.binds = Some(binds);
}
}

let mut cmd = vec![
"server".to_string(),
"--disable=traefik".to_string(),
Expand Down
39 changes: 39 additions & 0 deletions crates/openshell-sandbox/src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,22 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
target_os = "redox"
)))]
{
let cdi_gids = snapshot_cdi_gids();
nix::unistd::initgroups(user_cstr.as_c_str(), group.gid).into_diagnostic()?;
if !cdi_gids.is_empty() {
let mut merged: Vec<nix::unistd::Gid> =
nix::unistd::getgroups().unwrap_or_default();
for gid in &cdi_gids {
if !merged.contains(gid) {
merged.push(*gid);
}
}
tracing::info!(
gids = ?cdi_gids.iter().map(|g| g.as_raw()).collect::<Vec<_>>(),
"Preserving CDI-injected supplementary GIDs across initgroups"
);
nix::unistd::setgroups(&merged).into_diagnostic()?;
}
}
}

Expand Down Expand Up @@ -458,6 +473,30 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
Ok(())
}

/// Snapshot supplementary GIDs injected by the container runtime (e.g. via CDI
/// `additionalGids`) before `initgroups` replaces them.
///
/// Only captures GIDs when GPU devices are present — on non-GPU sandboxes the
/// runtime won't inject device-access GIDs so there is nothing to preserve.
/// GID 0 (root) is always excluded to avoid inadvertent privilege retention.
#[cfg(not(any(
    target_os = "macos",
    target_os = "ios",
    target_os = "haiku",
    target_os = "redox"
)))]
fn snapshot_cdi_gids() -> Vec<nix::unistd::Gid> {
    // NOTE(review): /dev/nvidiactl is the NVML-style control node; assumes CDI
    // injection exposes it inside the sandbox on all supported GPU platforms
    // (including Tegra) — confirm, since Tegra hosts are CSV/non-NVML based.
    let gpu_present = std::path::Path::new("/dev/nvidiactl").exists();
    if !gpu_present {
        return Vec::new();
    }
    // Current supplementary groups minus root; if the group list cannot be
    // read, fall back to preserving nothing rather than failing the drop.
    let mut preserved = Vec::new();
    for gid in nix::unistd::getgroups().unwrap_or_default() {
        if gid.as_raw() != 0 {
            preserved.push(gid);
        }
    }
    preserved
}

/// Process exit status.
#[derive(Debug, Clone, Copy)]
pub struct ProcessStatus {
Expand Down
2 changes: 1 addition & 1 deletion deploy/docker/Dockerfile.images
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ ARG K3S_VERSION=v1.35.2-k3s1
ARG K3S_DIGEST=sha256:c3184157c3048112bab0c3e17405991da486cb3413511eba23f7650efd70776b
ARG K9S_VERSION=v0.50.18
ARG HELM_VERSION=v3.17.3
ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.18.2-1
ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.19.0-1

# ---------------------------------------------------------------------------
# Shared Rust build stages
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ spec:
targetNamespace: nvidia-device-plugin
createNamespace: true
valuesContent: |-
image:
repository: ghcr.io/nvidia/k8s-device-plugin
tag: "2ab68c16"
runtimeClassName: nvidia
deviceListStrategy: cdi-cri
deviceIDStrategy: index
Expand Down
10 changes: 6 additions & 4 deletions e2e/python/test_sandbox_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ def test_gpu_sandbox_reports_available_gpu(
sandbox: Callable[..., Sandbox],
gpu_sandbox_spec: datamodel_pb2.SandboxSpec,
) -> None:
nvidia_smi_args = ["--query-gpu=name", "--format=csv,noheader"]
with sandbox(spec=gpu_sandbox_spec, delete_on_exit=True) as sb:
result = sb.exec(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
timeout_seconds=30,
)
result = sb.exec(["nvidia-smi", *nvidia_smi_args], timeout_seconds=30)
if result.exit_code != 0:
# On some platforms (e.g. Tegra/Jetson) nvidia-smi lives in
# /usr/sbin rather than /usr/bin and may not be on PATH.
result = sb.exec(["/usr/sbin/nvidia-smi", *nvidia_smi_args], timeout_seconds=30)

assert result.exit_code == 0, result.stderr
assert result.stdout.strip()
Loading