diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index e2fdae29846..fab8d68f9e5 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -55,6 +55,7 @@ jobs: ${{ (inputs.test-name == 'badaml-sandbox' && 'badaml-sandbox') || (inputs.test-name == 'badaml-vuln' && 'badaml-vuln') || + (inputs.test-name == 'runtime-rs-tmp' && 'runtime-rs') || 'base' }} steps: diff --git a/.github/workflows/e2e_manual.yml b/.github/workflows/e2e_manual.yml index d647bb44a5c..4e4b4344172 100644 --- a/.github/workflows/e2e_manual.yml +++ b/.github/workflows/e2e_manual.yml @@ -28,6 +28,7 @@ on: - policy - proxy - regression + - runtime-rs-tmp - servicemesh - vault - volumestatefulset diff --git a/.github/workflows/e2e_nightly.yml b/.github/workflows/e2e_nightly.yml index eec39a92c04..44d5854f7c4 100644 --- a/.github/workflows/e2e_nightly.yml +++ b/.github/workflows/e2e_nightly.yml @@ -67,6 +67,7 @@ jobs: - policy - proxy - regression + - runtime-rs-tmp - servicemesh - vault - volumestatefulset diff --git a/.github/workflows/e2e_runtime_rs.yml b/.github/workflows/e2e_runtime_rs.yml new file mode 100644 index 00000000000..c66d1b8e1f3 --- /dev/null +++ b/.github/workflows/e2e_runtime_rs.yml @@ -0,0 +1,48 @@ +name: e2e test runtime-rs + +on: + pull_request: + paths: + - overlays/sets/runtime-rs.nix + - packages/by-name/kata/runtime/** + - packages/by-name/kata/runtime-rs/** + - e2e/runtime-rs-tmp/** + +jobs: + tests: + strategy: + matrix: + platform: + - name: Metal-QEMU-SNP + runner: SNP + self-hosted: true + - name: Metal-QEMU-TDX + runner: TDX + self-hosted: true + - name: Metal-QEMU-SNP-GPU + runner: SNP-GPU + self-hosted: true + - name: Metal-QEMU-TDX-GPU + runner: TDX-GPU + self-hosted: true + test-name: + - runtime-rs-tmp + fail-fast: false + name: "${{ matrix.platform.name }}" + uses: ./.github/workflows/e2e.yml + with: + skip-undeploy: false + test-name: ${{ matrix.test-name }} + platform: ${{ matrix.platform.name }} + runner: ${{ 
matrix.platform.runner }} + self-hosted: ${{ matrix.platform.self-hosted }} + debug-shell: true + secrets: + GITHUB_TOKEN_IN: ${{ secrets.GITHUB_TOKEN }} + CACHIX_AUTH_TOKEN: ${{ secrets.CACHIX_AUTH_TOKEN }} + NUNKI_CI_COMMIT_PUSH_PR: ${{ secrets.NUNKI_CI_COMMIT_PUSH_PR }} + TEAMS_CI_WEBHOOK: ${{ secrets.TEAMS_CI_WEBHOOK }} + CONTRAST_GHCR_READ: ${{ secrets.CONTRAST_GHCR_READ }} + permissions: + contents: read + packages: write diff --git a/e2e/runtime-rs-tmp/runtimers_test.go b/e2e/runtime-rs-tmp/runtimers_test.go new file mode 100644 index 00000000000..dacc5bab626 --- /dev/null +++ b/e2e/runtime-rs-tmp/runtimers_test.go @@ -0,0 +1,54 @@ +// Copyright 2026 Edgeless Systems GmbH +// SPDX-License-Identifier: BUSL-1.1 + +package runtimerstmp + +import ( + "context" + "flag" + "os" + "testing" + "time" + + "github.com/edgelesssys/contrast/e2e/internal/contrasttest" + "github.com/edgelesssys/contrast/internal/kuberesource" + "github.com/edgelesssys/contrast/internal/manifest" + "github.com/edgelesssys/contrast/internal/platforms" + "github.com/stretchr/testify/require" +) + +// TODO: remove when runtime-rs is fully integrated. +// Right now there are some failures left, so we only test that we can start up a container. +// Remove the test and use openssl and other tests when ready. 
+ +func TestRuntimeRS(t *testing.T) { + platform, err := platforms.FromString(contrasttest.Flags.PlatformStr) + require.NoError(t, err) + + ct := contrasttest.New(t) + + require.True(t, contrasttest.Flags.InsecureEnableDebugShell, "the --insecure-enable-debug-shell-access flag must be set to true to extract the initrd start address") + + runtimeHandler, err := manifest.RuntimeHandler(platform) + require.NoError(t, err) + resources := kuberesource.CoordinatorBundle() + resources = kuberesource.PatchRuntimeHandlers(resources, runtimeHandler) + resources = kuberesource.AddPortForwarders(resources) + ct.Init(t, resources) + + require.True(t, t.Run("generate", ct.Generate), "contrast generate needs to succeed for subsequent tests") + require.True(t, t.Run("apply", ct.Apply), "Kubernetes resources need to be applied for subsequent tests") + + // 'set' currently errors because of wrong measurements, but the debugshell init container should come up. + require.True(t, t.Run("wait for debugshell", func(t *testing.T) { + ctx, cancel := context.WithTimeout(t.Context(), ct.FactorPlatformTimeout(2*time.Minute)) + defer cancel() + require.NoError(t, ct.Kubeclient.WaitForContainer(ctx, ct.Namespace, "coordinator-0", "contrast-debug-shell")) + }), "debugshell start must succeed for subsequent tests") +} + +func TestMain(m *testing.M) { + contrasttest.RegisterFlags() + flag.Parse() + os.Exit(m.Run()) +} diff --git a/nodeinstaller/internal/kataconfig/config.go b/nodeinstaller/internal/kataconfig/config.go index 39627085c1c..d8c88a8eebe 100644 --- a/nodeinstaller/internal/kataconfig/config.go +++ b/nodeinstaller/internal/kataconfig/config.go @@ -4,7 +4,6 @@ package kataconfig import ( - _ "embed" "fmt" "path/filepath" @@ -12,22 +11,8 @@ import ( "github.com/pelletier/go-toml/v2" ) -var ( - // kataBareMetalQEMUTDXBaseConfig is the configuration file for the Kata runtime on bare-metal TDX - // with QEMU. 
- // - //go:embed configuration-qemu-tdx.toml - kataBareMetalQEMUTDXBaseConfig string - - // kataBareMetalQEMUSNPBaseConfig is the configuration file for the Kata runtime on bare-metal SNP - // with QEMU. - // - //go:embed configuration-qemu-snp.toml - kataBareMetalQEMUSNPBaseConfig string - - // RuntimeNamePlaceholder is the placeholder for the per-runtime path (i.e. /opt/edgeless/contrast-cc...) in the target file paths. - RuntimeNamePlaceholder = "@@runtimeName@@" -) +// RuntimeNamePlaceholder is the placeholder for the per-runtime path (i.e. /opt/edgeless/contrast-cc...) in the target file paths. +var RuntimeNamePlaceholder = "@@runtimeName@@" // KataRuntimeConfig returns the Kata runtime configuration. func KataRuntimeConfig( @@ -68,18 +53,6 @@ func KataRuntimeConfig( config.Agent["kata"]["debug_console_enabled"] = true config.Runtime["enable_debug"] = true } - // For larger images, we've been running into timeouts in e2e tests. - config.Agent["kata"]["dial_timeout"] = 120 - config.Runtime["create_container_timeout"] = 120 - // GPU-specific settings - if platforms.IsGPU(platform) { - config.Hypervisor["qemu"]["cold_plug_vfio"] = "root-port" - // GPU images tend to be larger, so give a better default timeout that - // allows for pulling those. - config.Agent["kata"]["dial_timeout"] = 600 - config.Runtime["create_container_timeout"] = 600 - config.Runtime["pod_resource_api_sock"] = "/var/lib/kubelet/pod-resources/kubelet.sock" - } // Use the resources installed by Contrast node-installer. config.Hypervisor["qemu"]["initrd"] = filepath.Join(baseDir, "share", "kata-initrd.zst") @@ -92,8 +65,6 @@ func KataRuntimeConfig( // TODO(katexochen): Remove after https://github.com/kata-containers/kata-containers/pull/12472 is merged. config.Hypervisor["qemu"]["disable_image_nvdimm"] = true - // Force container image gust pull so we don't have to use nydus-snapshotter. 
- config.Runtime["experimental_force_guest_pull"] = true // Replace the kernel params entirely (and don't append) since that's // also what we do when calculating the launch measurement. config.Hypervisor["qemu"]["kernel_params"] = qemuExtraKernelParams @@ -105,11 +76,12 @@ func KataRuntimeConfig( // Fix and align guest memory calculation. config.Hypervisor["qemu"]["default_memory"] = platforms.DefaultMemoryInMebiBytes(platform) config.Runtime["sandbox_cgroup_only"] = true - // Currently not using the upstream encrypted emptyDir feature. - config.Runtime["emptydir_mode"] = "shared-fs" + // TODO: Check again why we need this and how we can avoid it. config.Hypervisor["qemu"]["block_device_aio"] = "threads" + config = extraRuntimeConfig(config, platform) + return &config, nil } @@ -117,11 +89,11 @@ func KataRuntimeConfig( // Source: https://github.com/kata-containers/kata-containers/blob/4029d154ba0c26fcf4a8f9371275f802e3ef522c/src/runtime/pkg/katautils/Config.go // This is a simplified version of the actual configuration. type Config struct { - Hypervisor map[string]hypervisorConfig - Agent map[string]agentConfig - Image imageConfig - Factory factoryConfig - Runtime runtimeConfig + Hypervisor map[string]hypervisorConfig `toml:"hypervisor"` + Agent map[string]agentConfig `toml:"agent"` + Image imageConfig `toml:"image"` + Factory factoryConfig `toml:"factory"` + Runtime runtimeConfig `toml:"runtime"` } // Marshal encodes the configuration as TOML. 
diff --git a/nodeinstaller/internal/kataconfig/config_test.go b/nodeinstaller/internal/kataconfig/config_test.go index 5a2a19d10dd..8f423268d88 100644 --- a/nodeinstaller/internal/kataconfig/config_test.go +++ b/nodeinstaller/internal/kataconfig/config_test.go @@ -13,17 +13,6 @@ import ( "github.com/stretchr/testify/require" ) -var ( - //go:embed testdata/expected-configuration-qemu-snp.toml - expectedConfMetalQEMUSNP []byte - //go:embed testdata/expected-configuration-qemu-tdx.toml - expectedConfMetalQEMUTDX []byte - //go:embed testdata/expected-configuration-qemu-snp-gpu.toml - expectedConfMetalQEMUSNPGPU []byte - //go:embed testdata/expected-configuration-qemu-tdx-gpu.toml - expectedConfMetalQEMUTDXGPU []byte -) - func TestKataRuntimeConfig(t *testing.T) { testCases := map[platforms.Platform]struct { changeSnpFields bool @@ -63,9 +52,9 @@ func TestKataRuntimeConfig(t *testing.T) { // section and no section are handled differently by Kata, so we make sure that this section is // always present. // It's covered by the comparison with testdata, but we want to keep this explicit. 
- assert.Contains(string(configBytes), "[Agent.kata]") - assert.Contains(string(configBytes), "[Runtime]") - assert.Contains(string(configBytes), "[Hypervisor.qemu]") + assert.Contains(string(configBytes), "[agent.kata]") + assert.Contains(string(configBytes), "[runtime]") + assert.Contains(string(configBytes), "[hypervisor.qemu]") }) } } diff --git a/nodeinstaller/internal/kataconfig/configuration-qemu-snp.toml b/nodeinstaller/internal/kataconfig/configuration-qemu-snp-go.toml similarity index 100% rename from nodeinstaller/internal/kataconfig/configuration-qemu-snp.toml rename to nodeinstaller/internal/kataconfig/configuration-qemu-snp-go.toml diff --git a/nodeinstaller/internal/kataconfig/configuration-qemu-snp-rs.toml b/nodeinstaller/internal/kataconfig/configuration-qemu-snp-rs.toml new file mode 100644 index 00000000000..941d1be997a --- /dev/null +++ b/nodeinstaller/internal/kataconfig/configuration-qemu-snp-rs.toml @@ -0,0 +1,778 @@ +# Copyright (c) 2017-2019 Intel Corporation +# Copyright (c) 2021 Adobe Inc. +# Copyright (c) 2024 IBM Corp. +# Copyright (c) 2025-2026 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "config/configuration-qemu-runtime-rs.toml.in" +# XXX: Project: +# XXX: Name: Kata Containers +# XXX: Type: kata + +[hypervisor.qemu] +path = "/opt/kata/bin/qemu-system-x86_64" +kernel = "/opt/kata/share/kata-containers/vmlinuz.container" +initrd = "/opt/kata/share/kata-containers/kata-containers-initrd-confidential.img" +# image = "/opt/kata/share/kata-containers/kata-containers-confidential.img" +machine_type = "q35" + +# Enable confidential guest support. +# Toggling that setting may trigger different hardware features, ranging +# from memory encryption to both memory and CPU-state encryption and integrity. 
+# The Kata Containers runtime dynamically detects the available feature set and +# aims at enabling the largest possible one, returning an error if none is +# available, or none is supported by the hypervisor. +# +# Known limitations: +# * Does not work by design: +# - CPU Hotplug +# - Memory Hotplug +# - NVDIMM devices +# +# Default false +confidential_guest = true + +# Enable AMD SEV-SNP confidential guests +# In case of using confidential guests on AMD hardware that supports SEV-SNP, +# the following enables SEV-SNP guests. Default true +sev_snp_guest = true + +# SNP 'ID Block' and 'ID Authentication Information Structure'. +# If one of snp_id_block or snp_id_auth is specified, the other must be specified, too. +# Notice that the default SNP policy of QEMU (0x30000) is used by Kata, if not explicitly +# set via 'snp_guest_policy' option. The IDBlock contains the guest policy as field, and +# it must match the value from 'snp_guest_policy' or, if unset, the QEMU default policy. +# +# 96-byte, base64-encoded blob to provide the ‘ID Block’ structure for the +# SNP_LAUNCH_FINISH command defined in the SEV-SNP firmware ABI (QEMU default: all-zero) +snp_id_block = "" +# 4096-byte, base64-encoded blob to provide the ‘ID Authentication Information Structure’ +# for the SNP_LAUNCH_FINISH command defined in the SEV-SNP firmware ABI (QEMU default: all-zero) +snp_id_auth = "" + +# SNP Guest Policy, the ‘POLICY’ parameter to the SNP_LAUNCH_START command. +# If unset, the QEMU default policy (0x30000) will be used. +# Notice that the guest policy is enforced at VM launch, and your pod VMs +# won't start at all if the policy denys it. This will be indicated by a +# 'SNP_LAUNCH_START' error. +snp_guest_policy = 196608 + +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type = "ext4" + +# Block storage driver to be used for the VM rootfs is backed +# by a block device. 
This is virtio-blk-pci, virtio-blk-mmio or nvdimm +vm_rootfs_driver = "virtio-blk-pci" + +# Enable running QEMU VMM as a non-root user. +# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as +# a non-root random user. See documentation for the limitations of this mode. +rootless = false + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path" +enable_annotations = ["enable_iommu", "virtio_fs_extra_args", "kernel_params", "kernel_verity_params", "default_vcpus", "default_memory", "cc_init_data"] + +# List of valid annotations values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: ["/opt/kata/bin/qemu-system-x86_64"] +valid_hypervisor_paths = ["/opt/kata/bin/qemu-system-x86_64"] + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "cgroup_no_v1=all systemd.unified_cgroup_hierarchy=1" + +# Path to the firmware. +# If you want that qemu uses the default firmware leave this option empty +firmware = "/opt/kata/share/ovmf/AMDSEV.fd" + +# Path to the firmware volume. 
+# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables +# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables +# can be customized per each user while UEFI code is kept same. +firmware_volume = "" + +# Machine accelerators +# comma-separated list of machine accelerators to pass to the hypervisor. +# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"` +machine_accelerators = "" + +# Qemu seccomp sandbox feature +# comma-separated list of seccomp sandbox features to control the syscall access. +# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"` +# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox +# Another note: enabling this feature may reduce performance, you may enable +# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html +# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny" +seccompsandbox = "" + +# CPU features +# comma-separated list of cpu features to pass to the cpu +# For example, `cpu_features = "pmu=off,vmx=off" +cpu_features = "pmu=off" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to 1 +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = 1 + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# 
WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU +# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what are you doing. +# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8. +default_maxvcpus = 0 + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Until 30 devices per bridge can be hot plugged. +# * Until 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in qemu or in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to 1 +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = 1 + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set 2048 MiB. +default_memory = 2048 + +# +# Default memory slots per SB/VM. +# If unspecified then it will be set 10. +# This is will determine the times that memory will be hotadded to sandbox/VM. +memory_slots = 10 + +# Default maximum memory in MiB per SB / VM +# unspecified or == 0 --> will be set to the actual amount of physical RAM +# > 0 <= amount of physical RAM --> will be set to the specified number +# > amount of physical RAM --> will be set to the actual amount of physical RAM +default_maxmemory = 0 + +# The size in MiB will be plused to max memory of hypervisor. +# It is the memory address space for the NVDIMM device. 
+# If set block storage driver (block_device_driver) to "nvdimm", +# should set memory_offset to the size of block device. +# Default 0 +memory_offset = 0 + +# Specifies virtio-mem will be enabled or not. +# Please note that this option should be used with the command +# "echo 1 > /proc/sys/vm/overcommit_memory". +# Default false +enable_virtio_mem = false + +# Disable block device from being used for a container's rootfs. +# In case of a storage driver like devicemapper where a container's +# root file system is backed by a block device, the block device is passed +# directly to the hypervisor for performance reasons. +# This flag prevents the block device from being passed to the hypervisor, +# virtio-fs is used instead to pass the rootfs. +disable_block_device_use = false + +# Shared file system type: +# - virtio-fs (default) +# - virtio-fs-nydus +# - none +shared_fs = "none" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "/opt/kata/libexec/virtiofsd" + +# List of valid annotations values for the virtiofs daemon +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: ["/opt/kata/libexec/virtiofsd"] +valid_virtio_fs_daemon_paths = ["/opt/kata/libexec/virtiofsd"] + +# Default size of DAX cache in MiB +virtio_fs_cache_size = 0 + +# Default size of virtqueues +virtio_fs_queue_size = 1024 + +# Extra args for virtiofsd daemon +# +# Format example: +# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["-o", "log_level=debug"] or ["-d"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = ["--thread-pool-size=1", "-o", "announce_submounts"] + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - metadata +# Metadata and pathname lookup are cached in guest and never expire. +# Data is never cached in guest. 
+# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "auto" + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. This is virtio-scsi, virtio-blk +# or nvdimm. +block_device_driver = "virtio-scsi" + +# aio is the I/O mechanism used by qemu +# Options: +# +# - threads +# Pthread based disk I/O. +# +# - native +# Native Linux I/O. +# +# - io_uring +# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and +# qemu >=5.0. +block_device_aio = "io_uring" + +# Specifies cache-related options will be set to block devices or not. +# Default false +block_device_cache_set = false + +# Specifies cache-related options for block devices. +# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +block_device_cache_direct = false + +# Specifies cache-related options for block devices. +# Denotes whether flush requests for the device are ignored. +# Default false +block_device_cache_noflush = false + +# Enable iothreads (data-plane) to be used. This causes IO to be +# handled in a separate IO thread. This is currently only implemented +# for SCSI. +# +enable_iothreads = false + +# Independent IOThreads enables IO to be processed in a separate thread, it is +# for QEMU hotplug device attach to iothread, like virtio-blk. 
+indep_iothreads = 0 + +# Enable pre allocation of VM RAM, default false +# Enabling this will result in lower container density +# as all of the memory will be allocated and locked +# This is useful when you want to reserve all the memory +# upfront or in the cases where you want memory latencies +# to be very predictable +# Default false +enable_mem_prealloc = false + +# Reclaim guest freed memory. +# Enabling this will result in the VM balloon device having f_reporting=on set. +# Then the hypervisor will use it to reclaim guest freed memory. +# This is useful for reducing the amount of memory used by a VM. +# Enabling this feature may sometimes reduce the speed of memory access in +# the VM. +# +# Default false +reclaim_guest_freed_memory = false + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +enable_hugepages = false + +# Enable vhost-user storage device, default false +# Enabling this will result in some Linux reserved block type +# major range 240-254 being chosen to represent vhost-user devices. +enable_vhost_user_store = false + +# The base directory specifically used for vhost-user devices. +# Its sub-path "block" is used for block devices; "block/sockets" is +# where we expect vhost-user sockets to live; "block/devices" is where +# simulated block device nodes for vhost-user devices to live. 
+vhost_user_store_path = "/var/run/kata-containers/vhost-user" + +# Enable vIOMMU, default false +# Enabling this will result in the VM having a vIOMMU device +# This will also add the following options to the kernel's +# command line: intel_iommu=on,iommu=pt +enable_iommu = false + +# Enable IOMMU_PLATFORM, default false +# Enabling this will result in the VM device having iommu_platform=on set +enable_iommu_platform = false + +# List of valid annotations values for the vhost user store path +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: ["/var/run/kata-containers/vhost-user"] +valid_vhost_user_store_paths = ["/var/run/kata-containers/vhost-user"] + +# The timeout for reconnecting on non-server spdk sockets when the remote end goes away. +# qemu will delay this many seconds and then attempt to reconnect. +# Zero disables reconnecting, and the default is zero. +vhost_user_reconnect_timeout_sec = 0 + +# Enable file based guest memory support. The default is an empty string which +# will disable this feature. In the case of virtio-fs, this is enabled +# automatically and '/dev/shm' is used as the backing folder. +# This option will be ignored if VM templating is enabled. +file_mem_backend = "" + +# List of valid annotations values for the file_mem_backend annotation +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: [""] +valid_file_mem_backends = [""] + +# -pflash can add image file to VM. The arguments of it should be in format +# of ["/path/to/flash0.img", "/path/to/flash1.img"] +pflashes = [] + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. And Debug also enable the hmp socket. +# +# Default false +enable_debug = false + +# Disable the customizations done in the runtime when it detects +# that it is running on top a VMM. This will result in the runtime +# behaving as it would when running on bare metal. 
+# +disable_nesting_checks = true + +# If false and nvdimm is supported, use nvdimm device to plug guest image. +# Otherwise virtio-block device is used. +# +# nvdimm is not supported when `confidential_guest = true`. +disable_image_nvdimm = true + +# Before hot plugging a PCIe device, you need to add a pcie_root_port device. +# Use this parameter when using some large PCI bar devices, such as Nvidia GPU +# The value means the number of pcie_root_port +# Default 0 +pcie_root_port = 0 + +# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off +# security (vhost-net runs ring0) for network I/O performance. +disable_vhost_net = false + +# This option allows to add an extra HMP or QMP socket when `enable_debug = true` +# +# WARNING: Anyone with access to the extra socket can take full control of +# Qemu. This is for debugging purpose only and must *NEVER* be used in +# production. +# +# Valid values are : +# - "hmp" +# - "qmp" +# - "qmp-pretty" (same as "qmp" with pretty json formatting) +# +# If set to the empty string "", no extra monitor socket is added. This is +# the default. +#extra_monitor_socket = "hmp" + +# +# Default entropy source. +# The path to a host source of entropy (including a real hardware RNG) +# /dev/urandom and /dev/random are two main options. +# Be aware that /dev/random is a blocking source of entropy. If the host +# runs out of entropy, the VMs boot time will increase leading to get startup +# timeouts. +# The source of entropy /dev/urandom is non-blocking and provides a +# generally acceptable source of entropy. It should work well for pretty much +# all practical purposes. +entropy_source = "/dev/urandom" + +# List of valid annotations values for entropy_source +# The default if not set is empty (all annotations rejected.) 
+# Your distribution recommends: ["/dev/urandom","/dev/random",""] +valid_entropy_sources = ["/dev/urandom","/dev/random",""] + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +# Recommended value when enabling: "/usr/share/oci/hooks" +guest_hook_path = "" + +# +# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM). +# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic. +# Default 0-sized value means unlimited rate. +rx_rate_limiter_max_rate = 0 +# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM). +# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block) +# to discipline traffic. +# Default 0-sized value means unlimited rate. +tx_rate_limiter_max_rate = 0 +# network_queues configures the number of virtio-net queue pairs (RX/TX) exposed to the guest. +# Setting network_queues = N creates N RX queues and N TX queues (i.e., N queue pairs). 
+# More queues can improve network throughput and reduce per-queue contention by allowing packet processing to scale +# across multiple vCPUs/threads (subject to host/guest capabilities and backend configuration such as vhost-net). +# Increasing this value consumes more resources (e.g., virtqueue state, interrupts/MSI-X vectors, backend threads), +# so it should typically not exceed the number of vCPUs or the practical parallelism of the networking backend. +# Default: 1, Range: 1..=256 +network_queues = 1 + +# Set where to save the guest memory dump file. +# If set, when GUEST_PANICKED event occurred, +# guest memeory will be dumped to host filesystem under guest_memory_dump_path, +# This directory will be created automatically if it does not exist. +# +# The dumped file(also called vmcore) can be processed with crash or gdb. +# +# WARNING: +# Dump guest's memory can take very long depending on the amount of guest memory +# and use much disk space. +# Recommended value when enabling: "/var/crash/kata" +guest_memory_dump_path = "" + +# If enable paging. +# Basically, if you want to use "gdb" rather than "crash", +# or need the guest-virtual addresses in the ELF vmcore, +# then you should enable paging. +# +# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details +guest_memory_dump_paging = false + +# Enable swap in the guest. Default false. +# When enable_guest_swap is enabled, insert a raw file to the guest as the swap device +# if the swappiness of a container (set by annotation "io.katacontainers.container.resource.swappiness") +# is bigger than 0. +# The size of the swap device should be +# swap_in_bytes (set by annotation "io.katacontainers.container.resource.swap_in_bytes") - memory_limit_in_bytes. +# If swap_in_bytes is not set, the size should be memory_limit_in_bytes. +# If swap_in_bytes and memory_limit_in_bytes is not set, the size should +# be default_memory. 
+enable_guest_swap = false + +# use legacy serial for guest console if available and implemented for architecture. Default false +use_legacy_serial = false + +# disable applying SELinux on the VMM process (default false) +disable_selinux = false + +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. +# (default: true) +disable_guest_selinux = true + + +[factory] +# VM templating support. Once enabled, new VMs are created from template +# using vm cloning. They will share the same initial kernel, initramfs and +# agent memory by mapping it readonly. It helps speeding up new container +# creation and saves a lot of memory if there are many kata containers running +# on the same host. +# +# When disabled, new VMs are created from scratch. +# +# Note: Requires "initrd=" to be set ("image=" is not supported). +# +# Default false +enable_template = false + +# Specifies the path of template. +# +# Default "/run/vc/vm/template" +template_path = "/run/vc/vm/template" + +# The number of caches of VMCache: +# unspecified or == 0 --> VMCache is disabled +# > 0 --> will be set to the specified number +# +# VMCache is a function that creates VMs as caches before using it. +# It helps speed up new container creation. +# The function consists of a server and some clients communicating +# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto. +# The VMCache server will create some VMs and cache them by factory cache. +# It will convert the VM to gRPC format and transport it when it gets +# requests from clients. +# Factory grpccache is the VMCache client. It will request gRPC format +# VM and convert it back to a VM. If VMCache function is enabled, +# kata-runtime will request VM from factory grpccache when it creates +# a new sandbox.
+# +# Default 0 +vm_cache_number = 0 + +# Specify the address of the Unix socket that is used by VMCache. +# +# Default /var/run/kata-containers/cache.sock +vm_cache_endpoint = "/var/run/kata-containers/cache.sock" + +[agent.kata] +# If enabled, make the agent display debug-level messages. +# (default: disabled) +enable_debug = false + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. +# +# (default: disabled) +enable_tracing = false + +# Comma separated list of kernel modules and their parameters. +# These modules will be loaded in the guest kernel using modprobe(8). +# The following example can be used to load two kernel modules with parameters +# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"] +# The first word is considered as the module name and the rest as its parameters. +# Container will not be started when: +# * A kernel module is specified and the modprobe command is not installed in the guest +# or it fails loading the module. +# * The module is not available in the guest or it doesn't meet the guest kernel +# requirements, like architecture and version. +# +kernel_modules = [] + +# Enable debug console. + +# If enabled, user can connect guest OS running inside hypervisor +# through "kata-runtime exec " command +debug_console_enabled = false + +# Agent dial timeout in millisecond. +# (default: 10) +dial_timeout_ms = 10 + +# Agent reconnect timeout in millisecond. +# Retry times = reconnect_timeout_ms / dial_timeout_ms (default: 300) +# If you find pod cannot connect to the agent when starting, please +# consider increasing this value to increase the retry times.
+# You'd better not change the value of dial_timeout_ms, unless you have an +# idea of what you are doing. +# (default: 3000) +reconnect_timeout_ms = 3000 + +# Create Container Request Timeout +# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest. +# It's also used to ensure that workloads, especially those involving large image pulls within the guest, +# have sufficient time to complete. +# +# Effective Timeout Determination: +# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values: +# - create_container_timeout: The timeout value configured for creating containers (default: 30,000 milliseconds). +# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below: +# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout) +# Defaults to 60 second(s) +create_container_timeout = 60 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +enable_debug = false +# +# Internetworking model +# Determines how the VM should be connected to the +# the container network interface +# Options: +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customize network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model="tcfilter" + +name="virt_container" +hypervisor_name="qemu" +agent_name="kata" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. 
If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp = true + +# vCPUs pinning settings +# if enabled, each vCPU thread will be scheduled to a fixed CPU +# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet) +enable_vcpus_pinning = false + +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. +# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. +guest_selinux_label = "" + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +enable_tracing = false + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +disable_new_netns = false + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. 
+# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only = false + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt = true + +# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandboxes shared path. +# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. +# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` +# These will not be exposed to the container workloads, and are only provided for potential guest services. +sandbox_bind_mounts = [] + +# VFIO Mode +# Determines how VFIO devices should be presented to the container. +# Options: +# +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio.
The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's) +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode = "guest-kernel" + +# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will +# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest. +disable_guest_empty_dir = false + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# for example: +# experimental=["force_guest_pull"] +# which is for enable force_guest_pull mode in CoCo scenarios. +# (default: []) +experimental = [] + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +enable_pprof = false + +# Base directory of directly attachable network config. +# Network devices for VM-based containers are allowed to be placed in the +# host netns to eliminate as many hops as possible, which is what we +# called a "Directly Attachable Network". The config, set by special CNI +# plugins, is used to tell the Kata containers what devices are attached +# to the hypervisor. +# (default: /run/kata-containers/dans) +dan_conf = "/run/kata-containers/dans" + +# pod_resource_api_sock specifies the unix socket for the Kubelet's +# PodResource API endpoint. If empty, kubernetes based cold plug +# will not be attempted. 
In order for this feature to work, the +# KubeletPodResourcesGet featureGate must be enabled in Kubelet, +# if using Kubelet older than 1.34. +# +# The pod resource API's socket is relative to the Kubelet's root-dir, +# which is defined by the cluster admin, and its location is: +# ${KubeletRootDir}/pod-resources/kubelet.sock +# +# cold_plug_vfio(see hypervisor config) acts as a feature gate: +# cold_plug_vfio = no_port (default) => no cold plug +# cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need +# explicit CDI annotation for cold plug (applies mainly +# to non-k8s cases) +# cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet +# based cold plug. +pod_resource_api_sock = "" diff --git a/nodeinstaller/internal/kataconfig/configuration-qemu-tdx.toml b/nodeinstaller/internal/kataconfig/configuration-qemu-tdx-go.toml similarity index 100% rename from nodeinstaller/internal/kataconfig/configuration-qemu-tdx.toml rename to nodeinstaller/internal/kataconfig/configuration-qemu-tdx-go.toml diff --git a/nodeinstaller/internal/kataconfig/configuration-qemu-tdx-rs.toml b/nodeinstaller/internal/kataconfig/configuration-qemu-tdx-rs.toml new file mode 100644 index 00000000000..5dbd80ec6ca --- /dev/null +++ b/nodeinstaller/internal/kataconfig/configuration-qemu-tdx-rs.toml @@ -0,0 +1,759 @@ +# Copyright (c) 2017-2019 Intel Corporation +# Copyright (c) 2021 Adobe Inc. +# Copyright (c) 2025-2026 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. 
+# XXX: +# XXX: Source file: "config/configuration-qemu-runtime-rs.toml.in" +# XXX: Project: +# XXX: Name: Kata Containers +# XXX: Type: kata + +[hypervisor.qemu] +path = "/opt/kata/bin/qemu-system-x86_64" +kernel = "/opt/kata/share/kata-containers/vmlinuz.container" +image = "/opt/kata/share/kata-containers/kata-containers-confidential.img" +# initrd = "/opt/kata/share/kata-containers/kata-containers-initrd.img" +machine_type = "q35" +tdx_quote_generation_service_socket_port = 4050 + +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type = "ext4" + +# Block storage driver to be used for the VM rootfs is backed +# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm +vm_rootfs_driver = "virtio-blk-pci" + +# Enable confidential guest support. +# Toggling that setting may trigger different hardware features, ranging +# from memory encryption to both memory and CPU-state encryption and integrity. +# The Kata Containers runtime dynamically detects the available feature set and +# aims at enabling the largest possible one, returning an error if none is +# available, or none is supported by the hypervisor. +# +# Known limitations: +# * Does not work by design: +# - CPU Hotplug +# - Memory Hotplug +# - NVDIMM devices +# +# Default false +confidential_guest = true + +# Enable running QEMU VMM as a non-root user. +# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as +# a non-root random user. See documentation for the limitations of this mode. +rootless = false + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. 
"path" for io.katacontainers.config.hypervisor.path" +enable_annotations = ["enable_iommu", "virtio_fs_extra_args", "kernel_params", "kernel_verity_params", "default_vcpus", "default_memory", "cc_init_data"] + +# List of valid annotations values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: ["/opt/kata/bin/qemu-system-x86_64"] +valid_hypervisor_paths = ["/opt/kata/bin/qemu-system-x86_64"] + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "cgroup_no_v1=all systemd.unified_cgroup_hierarchy=1" + +# Optional dm-verity parameters (comma-separated key=value list): +# root_hash=...,salt=...,data_blocks=...,data_block_size=...,hash_block_size=... +# These are used by the runtime to assemble dm-verity kernel params. +kernel_verity_params = "root_hash=8dc14a3f32209aaa16929c56db29854d57005d4746d9c595c45b47439f0a154d,salt=663ac30775eab9b1ad905769f6e05cc5f774179557f9ff683870dbba144cbe03,data_blocks=64000,data_block_size=4096,hash_block_size=4096" + +# Path to the firmware. +# If you want that qemu uses the default firmware leave this option empty +firmware = "/opt/kata/share/ovmf/OVMF.inteltdx.fd" + +# Path to the firmware volume. +# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables +# as configuration) and FIRMWARE_CODE.fd (UEFI program image). 
UEFI variables +# can be customized per each user while UEFI code is kept same. +firmware_volume = "" + +# Machine accelerators +# comma-separated list of machine accelerators to pass to the hypervisor. +# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"` +machine_accelerators = "" + +# Qemu seccomp sandbox feature +# comma-separated list of seccomp sandbox features to control the syscall access. +# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"` +# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox +# Another note: enabling this feature may reduce performance, you may enable +# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html +# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny" +seccompsandbox = "" + +# CPU features +# comma-separated list of cpu features to pass to the cpu +# For example, `cpu_features = "pmu=off,vmx=off" +cpu_features = "pmu=off" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to 1 +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = 1 + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is 
+# greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU +# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what you are doing. +# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8. +default_maxvcpus = 0 + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Up to 30 devices per bridge can be hot plugged. +# * Up to 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in qemu or in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to 1 +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = 1 + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set to 2048 MiB. +default_memory = 2048 +# +# Default memory slots per SB/VM. +# If unspecified then it will be set to 10. +# This will determine the times that memory will be hotadded to sandbox/VM. +memory_slots = 10 + +# Default maximum memory in MiB per SB / VM +# unspecified or == 0 --> will be set to the actual amount of physical RAM +# > 0 <= amount of physical RAM --> will be set to the specified number +# > amount of physical RAM --> will be set to the actual amount of physical RAM +default_maxmemory = 0 + +# The size in MiB will be added to max memory of hypervisor. +# It is the memory address space for the NVDIMM device. +# If set block storage driver (block_device_driver) to "nvdimm", +# should set memory_offset to the size of block device. +# Default 0 +memory_offset = 0 + +# Specifies virtio-mem will be enabled or not.
+# Please note that this option should be used with the command +# "echo 1 > /proc/sys/vm/overcommit_memory". +# Default false +enable_virtio_mem = false + +# Disable block device from being used for a container's rootfs. +# In case of a storage driver like devicemapper where a container's +# root file system is backed by a block device, the block device is passed +# directly to the hypervisor for performance reasons. +# This flag prevents the block device from being passed to the hypervisor, +# virtio-fs is used instead to pass the rootfs. +disable_block_device_use = false + +# Shared file system type: +# - virtio-fs (default) +# - virtio-fs-nydus +# - none +shared_fs = "none" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "/opt/kata/libexec/virtiofsd" + +# List of valid annotations values for the virtiofs daemon +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: ["/opt/kata/libexec/virtiofsd"] +valid_virtio_fs_daemon_paths = ["/opt/kata/libexec/virtiofsd"] + +# Default size of DAX cache in MiB +virtio_fs_cache_size = 0 + +# Default size of virtqueues +virtio_fs_queue_size = 1024 + +# Extra args for virtiofsd daemon +# +# Format example: +# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["-o", "log_level=debug"] or ["-d"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = ["--thread-pool-size=1", "-o", "announce_submounts"] + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - metadata +# Metadata and pathname lookup are cached in guest and never expire. +# Data is never cached in guest. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). 
+# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "auto" + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. This is virtio-scsi, virtio-blk +# or nvdimm. +block_device_driver = "virtio-scsi" + +# aio is the I/O mechanism used by qemu +# Options: +# +# - threads +# Pthread based disk I/O. +# +# - native +# Native Linux I/O. +# +# - io_uring +# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and +# qemu >=5.0. +block_device_aio = "io_uring" + +# Specifies cache-related options will be set to block devices or not. +# Default false +block_device_cache_set = false + +# Specifies cache-related options for block devices. +# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +block_device_cache_direct = false + +# Specifies cache-related options for block devices. +# Denotes whether flush requests for the device are ignored. +# Default false +block_device_cache_noflush = false + +# Enable iothreads (data-plane) to be used. This causes IO to be +# handled in a separate IO thread. This is currently implemented +# for virtio-scsi and virtio-blk. +# +enable_iothreads = false + +# Independent IOThreads enables IO to be processed in a separate thread, it is +# for QEMU hotplug device attach to iothread, like virtio-blk. +indep_iothreads = 0 + +# Enable pre allocation of VM RAM, default false +# Enabling this will result in lower container density +# as all of the memory will be allocated and locked +# This is useful when you want to reserve all the memory +# upfront or in the cases where you want memory latencies +# to be very predictable +# Default false +enable_mem_prealloc = false + +# Reclaim guest freed memory. +# Enabling this will result in the VM balloon device having f_reporting=on set. +# Then the hypervisor will use it to reclaim guest freed memory. 
+# This is useful for reducing the amount of memory used by a VM. +# Enabling this feature may sometimes reduce the speed of memory access in +# the VM. +# +# Default false +reclaim_guest_freed_memory = false + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +enable_hugepages = false + +# Enable vhost-user storage device, default false +# Enabling this will result in some Linux reserved block type +# major range 240-254 being chosen to represent vhost-user devices. +enable_vhost_user_store = false + +# The base directory specifically used for vhost-user devices. +# Its sub-path "block" is used for block devices; "block/sockets" is +# where we expect vhost-user sockets to live; "block/devices" is where +# simulated block device nodes for vhost-user devices to live. +vhost_user_store_path = "/var/run/kata-containers/vhost-user" + +# Enable vIOMMU, default false +# Enabling this will result in the VM having a vIOMMU device +# This will also add the following options to the kernel's +# command line: intel_iommu=on,iommu=pt +enable_iommu = false + +# Enable IOMMU_PLATFORM, default false +# Enabling this will result in the VM device having iommu_platform=on set +enable_iommu_platform = false + +# List of valid annotations values for the vhost user store path +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: ["/var/run/kata-containers/vhost-user"] +valid_vhost_user_store_paths = ["/var/run/kata-containers/vhost-user"] + +# The timeout for reconnecting on non-server spdk sockets when the remote end goes away. +# qemu will delay this many seconds and then attempt to reconnect. +# Zero disables reconnecting, and the default is zero. 
+vhost_user_reconnect_timeout_sec = 0 + +# Enable file based guest memory support. The default is an empty string which +# will disable this feature. In the case of virtio-fs, this is enabled +# automatically and '/dev/shm' is used as the backing folder. +# This option will be ignored if VM templating is enabled. +file_mem_backend = "" + +# List of valid annotations values for the file_mem_backend annotation +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: [""] +valid_file_mem_backends = [""] + +# -pflash can add image file to VM. The arguments of it should be in format +# of ["/path/to/flash0.img", "/path/to/flash1.img"] +pflashes = [] + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. And Debug also enable the hmp socket. +# +# Default false +enable_debug = false + +# This option allows to add an extra HMP or QMP socket when `enable_debug = true` +# +# WARNING: Anyone with access to the extra socket can take full control of +# Qemu. This is for debugging purpose only and must *NEVER* be used in +# production. +# +# Valid values are : +# - "hmp" +# - "qmp" +# - "qmp-pretty" (same as "qmp" with pretty json formatting) +# +# If set to the empty string "", no extra monitor socket is added. This is +# the default. +extra_monitor_socket = "" + +# Disable the customizations done in the runtime when it detects +# that it is running on top a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +disable_nesting_checks = false + +# If false and nvdimm is supported, use nvdimm device to plug guest image. +# Otherwise virtio-block device is used. +# +# nvdimm is not supported when `confidential_guest = true`. +disable_image_nvdimm = true + +# Before hot plugging a PCIe device, you need to add a pcie_root_port device. 
+# Use this parameter when using some large PCI bar devices, such as Nvidia GPU +# The value means the number of pcie_root_port +# Default 0 +pcie_root_port = 0 + +# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off +# security (vhost-net runs ring0) for network I/O performance. +disable_vhost_net = false + +# +# Default entropy source. +# The path to a host source of entropy (including a real hardware RNG) +# /dev/urandom and /dev/random are two main options. +# Be aware that /dev/random is a blocking source of entropy. If the host +# runs out of entropy, the VMs boot time will increase leading to get startup +# timeouts. +# The source of entropy /dev/urandom is non-blocking and provides a +# generally acceptable source of entropy. It should work well for pretty much +# all practical purposes. +entropy_source = "/dev/urandom" + + +# List of valid annotations values for entropy_source +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: ["/dev/urandom","/dev/random",""] +valid_entropy_sources = ["/dev/urandom","/dev/random",""] + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. 
See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +# Recommended value when enabling: "/usr/share/oci/hooks" +guest_hook_path = "" +# +# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM). +# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic. +# Default 0-sized value means unlimited rate. +rx_rate_limiter_max_rate = 0 +# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM). +# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block) +# to discipline traffic. +# Default 0-sized value means unlimited rate. +tx_rate_limiter_max_rate = 0 +# network_queues configures the number of virtio-net queue pairs (RX/TX) exposed to the guest. +# Setting network_queues = N creates N RX queues and N TX queues (i.e., N queue pairs). +# More queues can improve network throughput and reduce per-queue contention by allowing packet processing to scale +# across multiple vCPUs/threads (subject to host/guest capabilities and backend configuration such as vhost-net). +# Increasing this value consumes more resources (e.g., virtqueue state, interrupts/MSI-X vectors, backend threads), +# so it should typically not exceed the number of vCPUs or the practical parallelism of the networking backend. +# Default: 1, Range: 1..=256 +network_queues = 1 + +# Set where to save the guest memory dump file. +# If set, when GUEST_PANICKED event occurred, +# guest memory will be dumped to host filesystem under guest_memory_dump_path, +# This directory will be created automatically if it does not exist. +# +# The dumped file(also called vmcore) can be processed with crash or gdb.
+# +# WARNING: +# Dump guest's memory can take very long depending on the amount of guest memory +# and use much disk space. +# Recommended value when enabling: "/var/crash/kata" +guest_memory_dump_path = "" + +# If enable paging. +# Basically, if you want to use "gdb" rather than "crash", +# or need the guest-virtual addresses in the ELF vmcore, +# then you should enable paging. +# +# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details +guest_memory_dump_paging = false + +# Enable swap in the guest. Default false. +# When enable_guest_swap is enabled, insert a raw file to the guest as the swap device +# if the swappiness of a container (set by annotation "io.katacontainers.container.resource.swappiness") +# is bigger than 0. +# The size of the swap device should be +# swap_in_bytes (set by annotation "io.katacontainers.container.resource.swap_in_bytes") - memory_limit_in_bytes. +# If swap_in_bytes is not set, the size should be memory_limit_in_bytes. +# If swap_in_bytes and memory_limit_in_bytes is not set, the size should +# be default_memory. +enable_guest_swap = false + +# use legacy serial for guest console if available and implemented for architecture. Default false +use_legacy_serial = false + +# disable applying SELinux on the VMM process (default false) +disable_selinux = false + +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. +# (default: true) +disable_guest_selinux = true + + +[factory] +# VM templating support. Once enabled, new VMs are created from template +# using vm cloning. They will share the same initial kernel, initramfs and +# agent memory by mapping it readonly. It helps speeding up new container +# creation and saves a lot of memory if there are many kata containers running +# on the same host. 
+# +# When disabled, new VMs are created from scratch. +# +# Note: Requires "initrd=" to be set ("image=" is not supported). +# +# Default false +enable_template = false + +# Specifies the path of template. +# +# Default "/run/vc/vm/template" +template_path = "/run/vc/vm/template" + +# The number of caches of VMCache: +# unspecified or == 0 --> VMCache is disabled +# > 0 --> will be set to the specified number +# +# VMCache is a function that creates VMs as caches before using it. +# It helps speed up new container creation. +# The function consists of a server and some clients communicating +# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto. +# The VMCache server will create some VMs and cache them by factory cache. +# It will convert the VM to gRPC format and transport it when gets +# requestion from clients. +# Factory grpccache is the VMCache client. It will request gRPC format +# VM and convert it back to a VM. If VMCache function is enabled, +# kata-runtime will request VM from factory grpccache when it creates +# a new sandbox. +# +# Default 0 +vm_cache_number = 0 + +# Specify the address of the Unix socket that is used by VMCache. +# +# Default /var/run/kata-containers/cache.sock +vm_cache_endpoint = "/var/run/kata-containers/cache.sock" + +[agent.kata] +# If enabled, make the agent display debug-level messages. +# (default: disabled) +enable_debug = false + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. +# +# (default: disabled) +enable_tracing = false + +# Comma separated list of kernel modules and their parameters. +# These modules will be loaded in the guest kernel using modprobe(8). 
+# The following example can be used to load two kernel modules with parameters +# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"] +# The first word is considered as the module name and the rest as its parameters. +# Container will not be started when: +# * A kernel module is specified and the modprobe command is not installed in the guest +# or it fails loading the module. +# * The module is not available in the guest or it doesn't met the guest kernel +# requirements, like architecture and version. +# +kernel_modules = [] + +# Enable debug console. + +# If enabled, user can connect guest OS running inside hypervisor +# through "kata-runtime exec " command + +debug_console_enabled = false + +# Agent dial timeout in millisecond. +# (default: 10) +dial_timeout_ms = 10 + +# Agent reconnect timeout in millisecond. +# Retry times = reconnect_timeout_ms / dial_timeout_ms (default: 300) +# If you find pod cannot connect to the agent when starting, please +# consider increasing this value to increase the retry times. +# You'd better not change the value of dial_timeout_ms, unless you have an +# idea of what you are doing. +# (default: 3000) +reconnect_timeout_ms = 3000 + +# Create Container Request Timeout +# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest. +# It's also used to ensure that workloads, especially those involving large image pulls within the guest, +# have sufficient time to complete. +# +# Effective Timeout Determination: +# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values: +# - create_container_timeout: The timeout value configured for creating containers (default: 30,000 milliseconds). 
+# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below: +# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout) +# Defaults to 60 second(s) +create_container_timeout = 60 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +enable_debug = false +# +# Internetworking model +# Determines how the VM should be connected to the +# the container network interface +# Options: +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customize network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model = "tcfilter" + +name="virt_container" +hypervisor_name="qemu" +agent_name="kata" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp = true + +# vCPUs pinning settings +# if enabled, each vCPU thread will be scheduled to a fixed CPU +# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet) +enable_vcpus_pinning = false + +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. +# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. 
+# Example value when enabling: "system_u:system_r:container_t" +guest_selinux_label = "" + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +enable_tracing = false + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +disable_new_netns = false + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only = false + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. 
This is generally helpful
+# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug.
+# Compatibility for determining appropriate sandbox (VM) size:
+# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
+# does not yet support sandbox sizing annotations.
+# - When running single containers using a tool like ctr, container sizing information will be available.
+static_sandbox_resource_mgmt = true
+
+# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandboxes shared path.
+# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
+# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`
+# These will not be exposed to the container workloads, and are only provided for potential guest services.
+sandbox_bind_mounts = []
+
+# VFIO Mode
+# Determines how VFIO devices should be presented to the container.
+# Options:
+#
+# - vfio
+# Matches behaviour of OCI runtimes (e.g. runc) as much as
+# possible. VFIO devices will appear in the container as VFIO
+# character devices under /dev/vfio. The exact names may differ
+# from the host (they need to match the VM's IOMMU group numbers
+# rather than the host's)
+#
+# - guest-kernel
+# This is a Kata-specific behaviour that's useful in certain cases.
+# The VFIO device is managed by whatever driver in the VM kernel
+# claims it. This means it will appear as one or more device nodes
+# or network interfaces depending on the nature of the device.
+# Using this mode requires specially built workloads that know how
+# to locate the relevant device interfaces within the VM.
+#
+vfio_mode = "guest-kernel"
+
+# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem.
Instead, emptyDir mounts will +# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest. +disable_guest_empty_dir = false + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# for example: +# experimental=["force_guest_pull"] +# which is for enable force_guest_pull mode in CoCo scenarios. +# (default: []) +experimental = [] + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +enable_pprof = false + +# Base directory of directly attachable network config. +# Network devices for VM-based containers are allowed to be placed in the +# host netns to eliminate as many hops as possible, which is what we +# called a "Directly Attachable Network". The config, set by special CNI +# plugins, is used to tell the Kata containers what devices are attached +# to the hypervisor. +# (default: /run/kata-containers/dans) +dan_conf = "/run/kata-containers/dans" + +# pod_resource_api_sock specifies the unix socket for the Kubelet's +# PodResource API endpoint. If empty, kubernetes based cold plug +# will not be attempted. In order for this feature to work, the +# KubeletPodResourcesGet featureGate must be enabled in Kubelet, +# if using Kubelet older than 1.34. 
+#
+# The pod resource API's socket is relative to the Kubelet's root-dir,
+# which is defined by the cluster admin, and its location is:
+# ${KubeletRootDir}/pod-resources/kubelet.sock
+#
+# cold_plug_vfio(see hypervisor config) acts as a feature gate:
+# cold_plug_vfio = no_port (default) => no cold plug
+# cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need
+# explicit CDI annotation for cold plug (applies mainly
+# to non-k8s cases)
+# cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet
+# based cold plug.
+pod_resource_api_sock = ""
diff --git a/nodeinstaller/internal/kataconfig/runtime_go.go b/nodeinstaller/internal/kataconfig/runtime_go.go
new file mode 100644
index 00000000000..90584acf1b2
--- /dev/null
+++ b/nodeinstaller/internal/kataconfig/runtime_go.go
@@ -0,0 +1,47 @@
+// Copyright 2026 Edgeless Systems GmbH
+// SPDX-License-Identifier: BUSL-1.1
+
+//go:build !runtimers
+
+package kataconfig
+
+import (
+	_ "embed"
+
+	"github.com/edgelesssys/contrast/internal/platforms"
+)
+
+var (
+	// kataBareMetalQEMUTDXBaseConfig is the configuration file for the Kata runtime on bare-metal TDX
+	// with QEMU.
+	//
+	//go:embed configuration-qemu-tdx-go.toml
+	kataBareMetalQEMUTDXBaseConfig string
+	// kataBareMetalQEMUSNPBaseConfig is the configuration file for the Kata runtime on bare-metal SNP
+	// with QEMU.
+	//
+	//go:embed configuration-qemu-snp-go.toml
+	kataBareMetalQEMUSNPBaseConfig string
+)
+
+func extraRuntimeConfig(config Config, platform platforms.Platform) Config {
+	// Currently not using the upstream encrypted emptyDir feature.
+	config.Runtime["emptydir_mode"] = "shared-fs"
+	// For larger images, we've been running into timeouts in e2e tests.
+	config.Runtime["create_container_timeout"] = 120
+	// Force container image guest pull so we don't have to use nydus-snapshotter.
+ config.Runtime["experimental_force_guest_pull"] = true + + config.Agent["kata"]["dial_timeout"] = 120 + + if platforms.IsGPU(platform) { + // GPU images tend to be larger, so give a better default timeout that + // allows for pulling those. + config.Agent["kata"]["dial_timeout"] = 600 + config.Runtime["create_container_timeout"] = 600 + config.Hypervisor["qemu"]["cold_plug_vfio"] = "root-port" + config.Runtime["pod_resource_api_sock"] = "/var/lib/kubelet/pod-resources/kubelet.sock" + } + + return config +} diff --git a/nodeinstaller/internal/kataconfig/runtime_go_test.go b/nodeinstaller/internal/kataconfig/runtime_go_test.go new file mode 100644 index 00000000000..075893c7992 --- /dev/null +++ b/nodeinstaller/internal/kataconfig/runtime_go_test.go @@ -0,0 +1,19 @@ +// Copyright 2024 Edgeless Systems GmbH +// SPDX-License-Identifier: BUSL-1.1 + +//go:build !runtimers + +package kataconfig_test + +import _ "embed" + +var ( + //go:embed testdata/runtime-go/expected-configuration-qemu-snp.toml + expectedConfMetalQEMUSNP []byte + //go:embed testdata/runtime-go/expected-configuration-qemu-tdx.toml + expectedConfMetalQEMUTDX []byte + //go:embed testdata/runtime-go/expected-configuration-qemu-snp-gpu.toml + expectedConfMetalQEMUSNPGPU []byte + //go:embed testdata/runtime-go/expected-configuration-qemu-tdx-gpu.toml + expectedConfMetalQEMUTDXGPU []byte +) diff --git a/nodeinstaller/internal/kataconfig/runtime_rs.go b/nodeinstaller/internal/kataconfig/runtime_rs.go new file mode 100644 index 00000000000..6f67cd3107e --- /dev/null +++ b/nodeinstaller/internal/kataconfig/runtime_rs.go @@ -0,0 +1,38 @@ +// Copyright 2026 Edgeless Systems GmbH +// SPDX-License-Identifier: BUSL-1.1 + +//go:build runtimers + +package kataconfig + +import ( + _ "embed" + + "github.com/edgelesssys/contrast/internal/platforms" +) + +var ( + // kataBareMetalQEMUTDXBaseConfig is the configuration file for the Kata runtime on bare-metal TDX + // with QEMU. 
+ // + //go:embed configuration-qemu-tdx-rs.toml + kataBareMetalQEMUTDXBaseConfig string + // kataBareMetalQEMUSNPBaseConfig is the configuration file for the Kata runtime on bare-metal SNP + // with QEMU. + // + //go:embed configuration-qemu-snp-rs.toml + kataBareMetalQEMUSNPBaseConfig string +) + +func extraRuntimeConfig(config Config, platform platforms.Platform) Config { + config.Runtime["name"] = "virt_container" + config.Runtime["hypervisor_name"] = "qemu" + config.Runtime["agent_name"] = "kata" + config.Runtime["experimental"] = []string{"force_guest_pull"} + + config.Agent["kata"]["dial_timeout_ms"] = 1000 + config.Agent["kata"]["reconnect_timeout_ms"] = 60000 + config.Agent["kata"]["create_container_timeout"] = 120 + + return config +} diff --git a/nodeinstaller/internal/kataconfig/runtime_rs_test.go b/nodeinstaller/internal/kataconfig/runtime_rs_test.go new file mode 100644 index 00000000000..0cdf24ce79d --- /dev/null +++ b/nodeinstaller/internal/kataconfig/runtime_rs_test.go @@ -0,0 +1,19 @@ +// Copyright 2024 Edgeless Systems GmbH +// SPDX-License-Identifier: BUSL-1.1 + +//go:build runtimers + +package kataconfig_test + +import _ "embed" + +var ( + //go:embed testdata/runtime-rs/expected-configuration-qemu-snp.toml + expectedConfMetalQEMUSNP []byte + //go:embed testdata/runtime-rs/expected-configuration-qemu-tdx.toml + expectedConfMetalQEMUTDX []byte + //go:embed testdata/runtime-rs/expected-configuration-qemu-snp-gpu.toml + expectedConfMetalQEMUSNPGPU []byte + //go:embed testdata/runtime-rs/expected-configuration-qemu-tdx-gpu.toml + expectedConfMetalQEMUTDXGPU []byte +) diff --git a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-snp-gpu.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-snp-gpu.toml similarity index 97% rename from nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-snp-gpu.toml rename to 
nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-snp-gpu.toml index 5548b81daa9..293f614a8c5 100644 --- a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-snp-gpu.toml +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-snp-gpu.toml @@ -1,5 +1,5 @@ -[Hypervisor] -[Hypervisor.qemu] +[hypervisor] +[hypervisor.qemu] block_device_aio = 'threads' block_device_cache_direct = false block_device_cache_noflush = false @@ -77,21 +77,21 @@ virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' virtio_fs_extra_args = ['--thread-pool-size=1', '--announce-submounts'] virtio_fs_queue_size = 1024 -[Agent] -[Agent.kata] +[agent] +[agent.kata] debug_console_enabled = false dial_timeout = 600 enable_debug = false enable_tracing = false kernel_modules = [] -[Factory] +[factory] enable_template = false template_path = '/run/vc/vm/template' vm_cache_endpoint = '/var/run/kata-containers/cache.sock' vm_cache_number = 0 -[Runtime] +[runtime] create_container_timeout = 600 dan_conf = '/run/kata-containers/dans' disable_guest_empty_dir = false diff --git a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-snp.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-snp.toml similarity index 97% rename from nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-snp.toml rename to nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-snp.toml index c6058473492..f82c1308ff8 100644 --- a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-snp.toml +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-snp.toml @@ -1,5 +1,5 @@ -[Hypervisor] -[Hypervisor.qemu] +[hypervisor] +[hypervisor.qemu] block_device_aio = 'threads' block_device_cache_direct = false block_device_cache_noflush = false @@ -76,21 +76,21 @@ virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' 
virtio_fs_extra_args = ['--thread-pool-size=1', '--announce-submounts'] virtio_fs_queue_size = 1024 -[Agent] -[Agent.kata] +[agent] +[agent.kata] debug_console_enabled = false dial_timeout = 120 enable_debug = false enable_tracing = false kernel_modules = [] -[Factory] +[factory] enable_template = false template_path = '/run/vc/vm/template' vm_cache_endpoint = '/var/run/kata-containers/cache.sock' vm_cache_number = 0 -[Runtime] +[runtime] create_container_timeout = 120 dan_conf = '/run/kata-containers/dans' disable_guest_empty_dir = false diff --git a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-tdx-gpu.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-tdx-gpu.toml similarity index 97% rename from nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-tdx-gpu.toml rename to nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-tdx-gpu.toml index 1f6d7641d83..2aa0689ba70 100644 --- a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-tdx-gpu.toml +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-tdx-gpu.toml @@ -1,5 +1,5 @@ -[Hypervisor] -[Hypervisor.qemu] +[hypervisor] +[hypervisor.qemu] block_device_aio = 'threads' block_device_cache_direct = false block_device_cache_noflush = false @@ -75,21 +75,21 @@ virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' virtio_fs_extra_args = ['--thread-pool-size=1', '--announce-submounts'] virtio_fs_queue_size = 1024 -[Agent] -[Agent.kata] +[agent] +[agent.kata] debug_console_enabled = false dial_timeout = 600 enable_debug = false enable_tracing = false kernel_modules = [] -[Factory] +[factory] enable_template = false template_path = '/run/vc/vm/template' vm_cache_endpoint = '/var/run/kata-containers/cache.sock' vm_cache_number = 0 -[Runtime] +[runtime] create_container_timeout = 600 dan_conf = '/run/kata-containers/dans' disable_guest_empty_dir = false diff --git 
a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-tdx.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-tdx.toml similarity index 97% rename from nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-tdx.toml rename to nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-tdx.toml index 57a556a4ef7..b885f215ecd 100644 --- a/nodeinstaller/internal/kataconfig/testdata/expected-configuration-qemu-tdx.toml +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-go/expected-configuration-qemu-tdx.toml @@ -1,5 +1,5 @@ -[Hypervisor] -[Hypervisor.qemu] +[hypervisor] +[hypervisor.qemu] block_device_aio = 'threads' block_device_cache_direct = false block_device_cache_noflush = false @@ -74,21 +74,21 @@ virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' virtio_fs_extra_args = ['--thread-pool-size=1', '--announce-submounts'] virtio_fs_queue_size = 1024 -[Agent] -[Agent.kata] +[agent] +[agent.kata] debug_console_enabled = false dial_timeout = 120 enable_debug = false enable_tracing = false kernel_modules = [] -[Factory] +[factory] enable_template = false template_path = '/run/vc/vm/template' vm_cache_endpoint = '/var/run/kata-containers/cache.sock' vm_cache_number = 0 -[Runtime] +[runtime] create_container_timeout = 120 dan_conf = '/run/kata-containers/dans' disable_guest_empty_dir = false diff --git a/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-snp-gpu.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-snp-gpu.toml new file mode 100644 index 00000000000..68510999d7a --- /dev/null +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-snp-gpu.toml @@ -0,0 +1,116 @@ +[hypervisor] +[hypervisor.qemu] +block_device_aio = 'threads' +block_device_cache_direct = false +block_device_cache_noflush = false +block_device_cache_set = false +block_device_driver = 'virtio-scsi' 
+confidential_guest = true +contrast_imagepuller_config = '' +cpu_features = 'pmu=off' +default_bridges = 1 +default_maxmemory = 0 +default_maxvcpus = 0 +default_memory = 1024 +default_vcpus = 1 +disable_block_device_use = false +disable_guest_selinux = true +disable_image_nvdimm = true +disable_nesting_checks = true +disable_selinux = false +disable_vhost_net = false +enable_annotations = ['snp_id_block_Milan', 'snp_id_auth_Milan', 'snp_guest_policy_Milan', 'snp_id_block_Genoa', 'snp_id_auth_Genoa', 'snp_guest_policy_Genoa', 'cc_init_data'] +enable_debug = false +enable_guest_swap = false +enable_hugepages = false +enable_iommu = false +enable_iommu_platform = false +enable_iothreads = false +enable_mem_prealloc = false +enable_vhost_user_store = false +enable_virtio_mem = false +entropy_source = '/dev/urandom' +file_mem_backend = '' +firmware = '/snp/share/OVMF.fd' +firmware_volume = '' +guest_hook_path = '' +guest_memory_dump_paging = false +guest_memory_dump_path = '' +image = '/share/kata-containers.img' +indep_iothreads = 0 +initrd = '/share/kata-initrd.zst' +kernel = '/share/kata-kernel' +kernel_params = '' +machine_accelerators = '' +machine_type = 'q35' +memory_offset = 0 +memory_slots = 10 +network_queues = 1 +path = '/bin/qemu-system-x86_64' +pcie_root_port = 0 +pflashes = [] +reclaim_guest_freed_memory = false +rootfs_type = 'erofs' +rootless = false +rx_rate_limiter_max_rate = 0 +seccompsandbox = '' +sev_snp_guest = true +shared_fs = 'none' +snp_guest_policy = 196608 +snp_id_auth = '' +snp_id_block = '' +tx_rate_limiter_max_rate = 0 +use_legacy_serial = false +valid_entropy_sources = ['/dev/urandom', '/dev/random', ''] +valid_file_mem_backends = [''] +valid_hypervisor_paths = ['/bin/qemu-system-x86_64'] +valid_vhost_user_store_paths = ['/var/run/kata-containers/vhost-user'] +valid_virtio_fs_daemon_paths = ['/opt/kata/libexec/virtiofsd'] +vhost_user_reconnect_timeout_sec = 0 +vhost_user_store_path = '/var/run/kata-containers/vhost-user' +virtio_fs_cache 
= 'auto' +virtio_fs_cache_size = 0 +virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' +virtio_fs_extra_args = ['--thread-pool-size=1', '-o', 'announce_submounts'] +virtio_fs_queue_size = 1024 +vm_rootfs_driver = 'virtio-blk-pci' + +[agent] +[agent.kata] +create_container_timeout = 120 +debug_console_enabled = false +dial_timeout_ms = 1000 +enable_debug = false +enable_tracing = false +kernel_modules = [] +reconnect_timeout_ms = 60000 + +[factory] +enable_template = false +template_path = '/run/vc/vm/template' +vm_cache_endpoint = '/var/run/kata-containers/cache.sock' +vm_cache_number = 0 + +[runtime] +agent_name = 'kata' +dan_conf = '/run/kata-containers/dans' +disable_guest_empty_dir = false +disable_guest_seccomp = true +disable_new_netns = false +enable_debug = false +enable_pprof = false +enable_tracing = false +enable_vcpus_pinning = false +experimental = ['force_guest_pull'] +guest_selinux_label = '' +hypervisor_name = 'qemu' +internetworking_model = 'tcfilter' +jaeger_endpoint = '' +jaeger_password = '' +jaeger_user = '' +name = 'virt_container' +pod_resource_api_sock = '' +sandbox_bind_mounts = [] +sandbox_cgroup_only = true +static_sandbox_resource_mgmt = true +vfio_mode = 'guest-kernel' diff --git a/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-snp.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-snp.toml new file mode 100644 index 00000000000..26198537405 --- /dev/null +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-snp.toml @@ -0,0 +1,116 @@ +[hypervisor] +[hypervisor.qemu] +block_device_aio = 'threads' +block_device_cache_direct = false +block_device_cache_noflush = false +block_device_cache_set = false +block_device_driver = 'virtio-scsi' +confidential_guest = true +contrast_imagepuller_config = '' +cpu_features = 'pmu=off' +default_bridges = 1 +default_maxmemory = 0 +default_maxvcpus = 0 +default_memory = 512 +default_vcpus = 1 
+disable_block_device_use = false +disable_guest_selinux = true +disable_image_nvdimm = true +disable_nesting_checks = true +disable_selinux = false +disable_vhost_net = false +enable_annotations = ['snp_id_block_Milan', 'snp_id_auth_Milan', 'snp_guest_policy_Milan', 'snp_id_block_Genoa', 'snp_id_auth_Genoa', 'snp_guest_policy_Genoa', 'cc_init_data'] +enable_debug = false +enable_guest_swap = false +enable_hugepages = false +enable_iommu = false +enable_iommu_platform = false +enable_iothreads = false +enable_mem_prealloc = false +enable_vhost_user_store = false +enable_virtio_mem = false +entropy_source = '/dev/urandom' +file_mem_backend = '' +firmware = '/snp/share/OVMF.fd' +firmware_volume = '' +guest_hook_path = '' +guest_memory_dump_paging = false +guest_memory_dump_path = '' +image = '/share/kata-containers.img' +indep_iothreads = 0 +initrd = '/share/kata-initrd.zst' +kernel = '/share/kata-kernel' +kernel_params = '' +machine_accelerators = '' +machine_type = 'q35' +memory_offset = 0 +memory_slots = 10 +network_queues = 1 +path = '/bin/qemu-system-x86_64' +pcie_root_port = 0 +pflashes = [] +reclaim_guest_freed_memory = false +rootfs_type = 'erofs' +rootless = false +rx_rate_limiter_max_rate = 0 +seccompsandbox = '' +sev_snp_guest = true +shared_fs = 'none' +snp_guest_policy = 196608 +snp_id_auth = '' +snp_id_block = '' +tx_rate_limiter_max_rate = 0 +use_legacy_serial = false +valid_entropy_sources = ['/dev/urandom', '/dev/random', ''] +valid_file_mem_backends = [''] +valid_hypervisor_paths = ['/bin/qemu-system-x86_64'] +valid_vhost_user_store_paths = ['/var/run/kata-containers/vhost-user'] +valid_virtio_fs_daemon_paths = ['/opt/kata/libexec/virtiofsd'] +vhost_user_reconnect_timeout_sec = 0 +vhost_user_store_path = '/var/run/kata-containers/vhost-user' +virtio_fs_cache = 'auto' +virtio_fs_cache_size = 0 +virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' +virtio_fs_extra_args = ['--thread-pool-size=1', '-o', 'announce_submounts'] +virtio_fs_queue_size = 1024 
+vm_rootfs_driver = 'virtio-blk-pci' + +[agent] +[agent.kata] +create_container_timeout = 120 +debug_console_enabled = false +dial_timeout_ms = 1000 +enable_debug = false +enable_tracing = false +kernel_modules = [] +reconnect_timeout_ms = 60000 + +[factory] +enable_template = false +template_path = '/run/vc/vm/template' +vm_cache_endpoint = '/var/run/kata-containers/cache.sock' +vm_cache_number = 0 + +[runtime] +agent_name = 'kata' +dan_conf = '/run/kata-containers/dans' +disable_guest_empty_dir = false +disable_guest_seccomp = true +disable_new_netns = false +enable_debug = false +enable_pprof = false +enable_tracing = false +enable_vcpus_pinning = false +experimental = ['force_guest_pull'] +guest_selinux_label = '' +hypervisor_name = 'qemu' +internetworking_model = 'tcfilter' +jaeger_endpoint = '' +jaeger_password = '' +jaeger_user = '' +name = 'virt_container' +pod_resource_api_sock = '' +sandbox_bind_mounts = [] +sandbox_cgroup_only = true +static_sandbox_resource_mgmt = true +vfio_mode = 'guest-kernel' diff --git a/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-tdx-gpu.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-tdx-gpu.toml new file mode 100644 index 00000000000..c250cc673a2 --- /dev/null +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-tdx-gpu.toml @@ -0,0 +1,115 @@ +[hypervisor] +[hypervisor.qemu] +block_device_aio = 'threads' +block_device_cache_direct = false +block_device_cache_noflush = false +block_device_cache_set = false +block_device_driver = 'virtio-scsi' +confidential_guest = true +contrast_imagepuller_config = '' +cpu_features = 'pmu=off' +default_bridges = 1 +default_maxmemory = 0 +default_maxvcpus = 0 +default_memory = 1024 +default_vcpus = 1 +disable_block_device_use = false +disable_guest_selinux = true +disable_image_nvdimm = true +disable_nesting_checks = false +disable_selinux = false +disable_vhost_net = false 
+enable_annotations = ['cc_init_data'] +enable_debug = false +enable_guest_swap = false +enable_hugepages = false +enable_iommu = false +enable_iommu_platform = false +enable_iothreads = false +enable_mem_prealloc = false +enable_vhost_user_store = false +enable_virtio_mem = false +entropy_source = '/dev/urandom' +extra_monitor_socket = '' +file_mem_backend = '' +firmware = '/tdx/share/OVMF.fd' +firmware_volume = '' +guest_hook_path = '' +guest_memory_dump_paging = false +guest_memory_dump_path = '' +image = '/share/kata-containers.img' +indep_iothreads = 0 +initrd = '/share/kata-initrd.zst' +kernel = '/share/kata-kernel' +kernel_params = '' +kernel_verity_params = '' +machine_accelerators = '' +machine_type = 'q35' +memory_offset = 0 +memory_slots = 10 +network_queues = 1 +path = '/bin/qemu-system-x86_64' +pcie_root_port = 0 +pflashes = [] +reclaim_guest_freed_memory = false +rootfs_type = 'erofs' +rootless = false +rx_rate_limiter_max_rate = 0 +seccompsandbox = '' +shared_fs = 'none' +tdx_quote_generation_service_socket_port = 4050 +tx_rate_limiter_max_rate = 0 +use_legacy_serial = false +valid_entropy_sources = ['/dev/urandom', '/dev/random', ''] +valid_file_mem_backends = [''] +valid_hypervisor_paths = ['/bin/qemu-system-x86_64'] +valid_vhost_user_store_paths = ['/var/run/kata-containers/vhost-user'] +valid_virtio_fs_daemon_paths = ['/opt/kata/libexec/virtiofsd'] +vhost_user_reconnect_timeout_sec = 0 +vhost_user_store_path = '/var/run/kata-containers/vhost-user' +virtio_fs_cache = 'auto' +virtio_fs_cache_size = 0 +virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' +virtio_fs_extra_args = ['--thread-pool-size=1', '-o', 'announce_submounts'] +virtio_fs_queue_size = 1024 +vm_rootfs_driver = 'virtio-blk-pci' + +[agent] +[agent.kata] +create_container_timeout = 120 +debug_console_enabled = false +dial_timeout_ms = 1000 +enable_debug = false +enable_tracing = false +kernel_modules = [] +reconnect_timeout_ms = 60000 + +[factory] +enable_template = false +template_path = 
'/run/vc/vm/template' +vm_cache_endpoint = '/var/run/kata-containers/cache.sock' +vm_cache_number = 0 + +[runtime] +agent_name = 'kata' +dan_conf = '/run/kata-containers/dans' +disable_guest_empty_dir = false +disable_guest_seccomp = true +disable_new_netns = false +enable_debug = false +enable_pprof = false +enable_tracing = false +enable_vcpus_pinning = false +experimental = ['force_guest_pull'] +guest_selinux_label = '' +hypervisor_name = 'qemu' +internetworking_model = 'tcfilter' +jaeger_endpoint = '' +jaeger_password = '' +jaeger_user = '' +name = 'virt_container' +pod_resource_api_sock = '' +sandbox_bind_mounts = [] +sandbox_cgroup_only = true +static_sandbox_resource_mgmt = true +vfio_mode = 'guest-kernel' diff --git a/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-tdx.toml b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-tdx.toml new file mode 100644 index 00000000000..d9b404a9b15 --- /dev/null +++ b/nodeinstaller/internal/kataconfig/testdata/runtime-rs/expected-configuration-qemu-tdx.toml @@ -0,0 +1,115 @@ +[hypervisor] +[hypervisor.qemu] +block_device_aio = 'threads' +block_device_cache_direct = false +block_device_cache_noflush = false +block_device_cache_set = false +block_device_driver = 'virtio-scsi' +confidential_guest = true +contrast_imagepuller_config = '' +cpu_features = 'pmu=off' +default_bridges = 1 +default_maxmemory = 0 +default_maxvcpus = 0 +default_memory = 512 +default_vcpus = 1 +disable_block_device_use = false +disable_guest_selinux = true +disable_image_nvdimm = true +disable_nesting_checks = false +disable_selinux = false +disable_vhost_net = false +enable_annotations = ['cc_init_data'] +enable_debug = false +enable_guest_swap = false +enable_hugepages = false +enable_iommu = false +enable_iommu_platform = false +enable_iothreads = false +enable_mem_prealloc = false +enable_vhost_user_store = false +enable_virtio_mem = false +entropy_source = '/dev/urandom' 
+extra_monitor_socket = '' +file_mem_backend = '' +firmware = '/tdx/share/OVMF.fd' +firmware_volume = '' +guest_hook_path = '' +guest_memory_dump_paging = false +guest_memory_dump_path = '' +image = '/share/kata-containers.img' +indep_iothreads = 0 +initrd = '/share/kata-initrd.zst' +kernel = '/share/kata-kernel' +kernel_params = '' +kernel_verity_params = '' +machine_accelerators = '' +machine_type = 'q35' +memory_offset = 0 +memory_slots = 10 +network_queues = 1 +path = '/bin/qemu-system-x86_64' +pcie_root_port = 0 +pflashes = [] +reclaim_guest_freed_memory = false +rootfs_type = 'erofs' +rootless = false +rx_rate_limiter_max_rate = 0 +seccompsandbox = '' +shared_fs = 'none' +tdx_quote_generation_service_socket_port = 4050 +tx_rate_limiter_max_rate = 0 +use_legacy_serial = false +valid_entropy_sources = ['/dev/urandom', '/dev/random', ''] +valid_file_mem_backends = [''] +valid_hypervisor_paths = ['/bin/qemu-system-x86_64'] +valid_vhost_user_store_paths = ['/var/run/kata-containers/vhost-user'] +valid_virtio_fs_daemon_paths = ['/opt/kata/libexec/virtiofsd'] +vhost_user_reconnect_timeout_sec = 0 +vhost_user_store_path = '/var/run/kata-containers/vhost-user' +virtio_fs_cache = 'auto' +virtio_fs_cache_size = 0 +virtio_fs_daemon = '/opt/kata/libexec/virtiofsd' +virtio_fs_extra_args = ['--thread-pool-size=1', '-o', 'announce_submounts'] +virtio_fs_queue_size = 1024 +vm_rootfs_driver = 'virtio-blk-pci' + +[agent] +[agent.kata] +create_container_timeout = 120 +debug_console_enabled = false +dial_timeout_ms = 1000 +enable_debug = false +enable_tracing = false +kernel_modules = [] +reconnect_timeout_ms = 60000 + +[factory] +enable_template = false +template_path = '/run/vc/vm/template' +vm_cache_endpoint = '/var/run/kata-containers/cache.sock' +vm_cache_number = 0 + +[runtime] +agent_name = 'kata' +dan_conf = '/run/kata-containers/dans' +disable_guest_empty_dir = false +disable_guest_seccomp = true +disable_new_netns = false +enable_debug = false +enable_pprof = false 
+enable_tracing = false +enable_vcpus_pinning = false +experimental = ['force_guest_pull'] +guest_selinux_label = '' +hypervisor_name = 'qemu' +internetworking_model = 'tcfilter' +jaeger_endpoint = '' +jaeger_password = '' +jaeger_user = '' +name = 'virt_container' +pod_resource_api_sock = '' +sandbox_bind_mounts = [] +sandbox_cgroup_only = true +static_sandbox_resource_mgmt = true +vfio_mode = 'guest-kernel' diff --git a/nodeinstaller/internal/kataconfig/update-testdata/main.go b/nodeinstaller/internal/kataconfig/update-testdata/main.go index 5745844b732..ad93d0420b6 100644 --- a/nodeinstaller/internal/kataconfig/update-testdata/main.go +++ b/nodeinstaller/internal/kataconfig/update-testdata/main.go @@ -49,9 +49,9 @@ func main() { } for platform, platformConfig := range platforms { - upstreamFile := filepath.Join(tarball, "opt", "kata", "share", "defaults", "kata-containers", fmt.Sprintf("configuration-%s.toml", platformConfig.upstream)) - configFile := filepath.Join(gitroot, "nodeinstaller", "internal", "kataconfig", fmt.Sprintf("configuration-%s.toml", platformConfig.config)) - testdataFile := filepath.Join(gitroot, "nodeinstaller", "internal", "kataconfig", "testdata", fmt.Sprintf("expected-configuration-%s.toml", platformConfig.testdata)) + upstreamFile := upstreamFile(tarball, platformConfig.upstream) + configFile := filepath.Join(gitroot, "nodeinstaller", "internal", "kataconfig", fmt.Sprintf("configuration-%s-%s.toml", platformConfig.config, configSuffix)) + testdataFile := filepath.Join(gitroot, "nodeinstaller", "internal", "kataconfig", "testdata", testdataSubdir, fmt.Sprintf("expected-configuration-%s.toml", platformConfig.testdata)) upstream, err := os.ReadFile(upstreamFile) if err != nil { diff --git a/nodeinstaller/internal/kataconfig/update-testdata/runtime_go.go b/nodeinstaller/internal/kataconfig/update-testdata/runtime_go.go new file mode 100644 index 00000000000..59693d33aa1 --- /dev/null +++ 
b/nodeinstaller/internal/kataconfig/update-testdata/runtime_go.go @@ -0,0 +1,20 @@ +// Copyright 2026 Edgeless Systems GmbH +// SPDX-License-Identifier: BUSL-1.1 + +//go:build !runtimers + +package main + +import ( + "fmt" + "path/filepath" +) + +const ( + testdataSubdir = "runtime-go" + configSuffix = "go" +) + +func upstreamFile(tarball, platform string) string { + return filepath.Join(tarball, "opt", "kata", "share", "defaults", "kata-containers", fmt.Sprintf("configuration-%s.toml", platform)) +} diff --git a/nodeinstaller/internal/kataconfig/update-testdata/runtime_rs.go b/nodeinstaller/internal/kataconfig/update-testdata/runtime_rs.go new file mode 100644 index 00000000000..252296535d4 --- /dev/null +++ b/nodeinstaller/internal/kataconfig/update-testdata/runtime_rs.go @@ -0,0 +1,21 @@ +// Copyright 2026 Edgeless Systems GmbH +// SPDX-License-Identifier: BUSL-1.1 + +//go:build runtimers + +package main + +import ( + _ "embed" + "fmt" + "path/filepath" +) + +const ( + testdataSubdir = "runtime-rs" + configSuffix = "rs" +) + +func upstreamFile(tarball, platform string) string { + return filepath.Join(tarball, "opt", "kata", "share", "defaults", "kata-containers", "runtime-rs", fmt.Sprintf("configuration-%s-runtime-rs.toml", platform)) +} diff --git a/overlays/sets/runtime-rs.nix b/overlays/sets/runtime-rs.nix new file mode 100644 index 00000000000..fad952e1650 --- /dev/null +++ b/overlays/sets/runtime-rs.nix @@ -0,0 +1,30 @@ +# Copyright 2026 Edgeless Systems GmbH +# SPDX-License-Identifier: BUSL-1.1 + +_final: prev: { + contrastPkgs = prev.contrastPkgs.overrideScope ( + _final: prev: { + kata = + let + kata-runtime = prev.kata.runtime; + runtime-rs = prev.kata.runtime-rs.override { + runtime = kata-runtime; + }; + in + prev.kata.overrideScope ( + _final: _prev: { + runtime = runtime-rs; + } + ); + contrast = prev.contrast.overrideScope ( + _final: prev: { + nodeinstaller = prev.nodeinstaller.overrideAttrs ( + _finalAttrs: prevAttrs: { + tags = prevAttrs.tags or 
[ ] ++ [ "runtimers" ]; + } + ); + } + ); + } + ); +} diff --git a/packages/by-name/contrast/e2e/package.nix b/packages/by-name/contrast/e2e/package.nix index 96c64502370..d4fa8309dd1 100644 --- a/packages/by-name/contrast/e2e/package.nix +++ b/packages/by-name/contrast/e2e/package.nix @@ -81,6 +81,7 @@ buildGoModule { "e2e/proxy" "e2e/regression" "e2e/release" + "e2e/runtime-rs-tmp" "e2e/servicemesh" "e2e/vault" "e2e/volumestatefulset" diff --git a/packages/by-name/kata/runtime-rs/package.nix b/packages/by-name/kata/runtime-rs/package.nix new file mode 100644 index 00000000000..5a494857919 --- /dev/null +++ b/packages/by-name/kata/runtime-rs/package.nix @@ -0,0 +1,84 @@ +# Copyright 2026 Edgeless Systems GmbH +# SPDX-License-Identifier: BUSL-1.1 + +{ + lib, + rustPlatform, + runtime, + protobuf, + pkg-config, + openssl, + + withDragonball ? false, +}: + +rustPlatform.buildRustPackage (finalAttrs: { + pname = "kata-runtime-rs"; + inherit (runtime) version src; + + buildAndTestSubdir = "src/runtime-rs"; + + cargoLock = { + lockFile = "${finalAttrs.src}/Cargo.lock"; + outputHashes = { + "api_client-0.1.0" = "sha256-aWtVgYlcbssL7lQfMFGJah8DrJN0s/w1ZFncCPHT1aE="; + }; + }; + + postPatch = '' + substitute src/runtime-rs/crates/shim/src/config.rs.in src/runtime-rs/crates/shim/src/config.rs \ + --replace-fail @PROJECT_NAME@ "Kata Containers" \ + --replace-fail @RUNTIME_VERSION@ ${finalAttrs.version} \ + --replace-fail @COMMIT@ none \ + --replace-fail @RUNTIME_NAME@ containerd-shim-kata-v2 \ + --replace-fail @CONTAINERD_RUNTIME_NAME@ io.containerd.kata.v2 + ''; + + nativeBuildInputs = [ + pkg-config + protobuf + ]; + + buildInputs = [ + openssl + openssl.dev + ]; + + # Build.rs writes to src + postConfigure = '' + chmod -R +w . + ''; + + buildFeatures = lib.optional withDragonball "dragonball"; + + env.OPENSSL_NO_VENDOR = 1; + + cargoTestFlags = [ "--bins" ]; + + checkFlags = [ + # Tests need root privileges or other stuff not available in the sandbox. 
+ "--skip=device::device_manager::tests::test_new_block_device" + "--skip=network::endpoint::endpoints_test::tests::test_ipvlan_construction" + "--skip=network::endpoint::endpoints_test::tests::test_macvlan_construction" + "--skip=network::endpoint::endpoints_test::tests::test_vlan_construction" + "--skip=test::test_new_hypervisor" + ]; + + # This is a placeholder to make this package compatible with the Go runtime, + # as the node-installer is configured to install this file. + # TODO(katexochen): Remove when switching to runtime-rs. + postInstall = '' + echo "placeholder, kata-runtime doesn't exist for runtime-rs" > $out/bin/kata-runtime + ''; + + passthru = { + inherit (runtime) cmdline; + }; + + meta = { + changelog = "https://github.com/kata-containers/kata-containers/releases/tag/${finalAttrs.version}"; + homepage = "https://github.com/kata-containers/kata-containers"; + mainProgram = "containerd-shim-kata-v2"; + license = lib.licenses.asl20; + }; +}) diff --git a/packages/by-name/kata/runtime/0007-runtime-rs-allow-initrd-AND-image-to-be-set.patch b/packages/by-name/kata/runtime/0007-runtime-rs-allow-initrd-AND-image-to-be-set.patch new file mode 100644 index 00000000000..375656f8133 --- /dev/null +++ b/packages/by-name/kata/runtime/0007-runtime-rs-allow-initrd-AND-image-to-be-set.patch @@ -0,0 +1,78 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Paul Meyer +Date: Mon, 30 Mar 2026 08:51:40 +0200 +Subject: [PATCH] runtime-rs: allow initrd AND image to be set + +Signed-off-by: Paul Meyer +--- + src/libs/kata-types/src/config/hypervisor/mod.rs | 5 ----- + src/runtime-rs/crates/hypervisor/ch-config/src/convert.rs | 4 ---- + .../crates/runtimes/virt_container/src/factory/vm.rs | 4 ---- + .../crates/runtimes/virt_container/src/sandbox.rs | 7 +++---- + 4 files changed, 3 insertions(+), 17 deletions(-) + +diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs +index 
6284103102c58da41e88da7a263a77c247c2760d..04f3bb2d77900b110f7239353ab806dfaa9d32a7 100644 +--- a/src/libs/kata-types/src/config/hypervisor/mod.rs ++++ b/src/libs/kata-types/src/config/hypervisor/mod.rs +@@ -473,11 +473,6 @@ impl BootInfo { + validate_path!(self.image, "guest boot image file {} is invalid: {}")?; + validate_path!(self.initrd, "guest initrd image file {} is invalid: {}")?; + validate_path!(self.firmware, "firmware image file {} is invalid: {}")?; +- if !self.image.is_empty() && !self.initrd.is_empty() { +- return Err(std::io::Error::other( +- "Can not configure both initrd and image for boot", +- )); +- } + + let l = [ + VIRTIO_BLK_PCI, +diff --git a/src/runtime-rs/crates/hypervisor/ch-config/src/convert.rs b/src/runtime-rs/crates/hypervisor/ch-config/src/convert.rs +index cd6998a649f0ab0838d4c78f03af01c2ffef3839..bd124e5d74defefcaab67e1cfe60bfda690e3f4e 100644 +--- a/src/runtime-rs/crates/hypervisor/ch-config/src/convert.rs ++++ b/src/runtime-rs/crates/hypervisor/ch-config/src/convert.rs +@@ -130,10 +130,6 @@ impl TryFrom for VmConfig { + let use_initrd = !boot_info.initrd.is_empty(); + let use_image = !boot_info.image.is_empty(); + +- if use_initrd && use_image { +- return Err(VmConfigError::MultipleBootFiles); +- } +- + if !use_initrd && !use_image { + return Err(VmConfigError::NoBootFile); + } +diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/factory/vm.rs b/src/runtime-rs/crates/runtimes/virt_container/src/factory/vm.rs +index 9be1e68aed0e5fa9529fd0ad8e8db53b7a6887f1..9fb0d82a91f1f32c2cea609a32e8f30d814cb977 100644 +--- a/src/runtime-rs/crates/runtimes/virt_container/src/factory/vm.rs ++++ b/src/runtime-rs/crates/runtimes/virt_container/src/factory/vm.rs +@@ -111,10 +111,6 @@ impl VmConfig { + return Err(anyhow!("missing image and initrd path")); + } + +- if has_image && has_initrd { +- return Err(anyhow!("image and initrd path cannot both be set")); +- } +- + Ok(()) + } + +diff --git 
a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +index 232c0547802bfd07a0432497ea1aab2ed052e2df..dd222d191e328190ab64a469f608a7ce2a9af957 100644 +--- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs ++++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +@@ -356,11 +356,10 @@ impl VirtSandbox { + let boot_info = self.hypervisor.hypervisor_config().await.boot_info; + let security_info = self.hypervisor.hypervisor_config().await.security_info; + +- if !boot_info.initrd.is_empty() { +- return Ok(None); +- } +- + if boot_info.image.is_empty() { ++ if !boot_info.initrd.is_empty() { ++ return Ok(None); ++ } + let is_remote_hypervisor = Arc::clone(&self.resource_manager.config().await) + .runtime + .hypervisor_name diff --git a/packages/by-name/kata/runtime/0007-genpolicy-do-not-log-policy-annotation-in-debug.patch b/packages/by-name/kata/runtime/0008-genpolicy-do-not-log-policy-annotation-in-debug.patch similarity index 100% rename from packages/by-name/kata/runtime/0007-genpolicy-do-not-log-policy-annotation-in-debug.patch rename to packages/by-name/kata/runtime/0008-genpolicy-do-not-log-policy-annotation-in-debug.patch diff --git a/packages/by-name/kata/runtime/0008-genpolicy-support-ephemeral-volume-source.patch b/packages/by-name/kata/runtime/0009-genpolicy-support-ephemeral-volume-source.patch similarity index 100% rename from packages/by-name/kata/runtime/0008-genpolicy-support-ephemeral-volume-source.patch rename to packages/by-name/kata/runtime/0009-genpolicy-support-ephemeral-volume-source.patch diff --git a/packages/by-name/kata/runtime/0009-genpolicy-don-t-allow-mount-storage-for-declared-VOL.patch b/packages/by-name/kata/runtime/0010-genpolicy-don-t-allow-mount-storage-for-declared-VOL.patch similarity index 100% rename from packages/by-name/kata/runtime/0009-genpolicy-don-t-allow-mount-storage-for-declared-VOL.patch rename to 
packages/by-name/kata/runtime/0010-genpolicy-don-t-allow-mount-storage-for-declared-VOL.patch diff --git a/packages/by-name/kata/runtime/0010-agent-use-custom-implementation-for-image-pulling.patch b/packages/by-name/kata/runtime/0011-agent-use-custom-implementation-for-image-pulling.patch similarity index 100% rename from packages/by-name/kata/runtime/0010-agent-use-custom-implementation-for-image-pulling.patch rename to packages/by-name/kata/runtime/0011-agent-use-custom-implementation-for-image-pulling.patch diff --git a/packages/by-name/kata/runtime/0011-agent-use-separate-unix-socket-for-image-pulling.patch b/packages/by-name/kata/runtime/0012-agent-use-separate-unix-socket-for-image-pulling.patch similarity index 100% rename from packages/by-name/kata/runtime/0011-agent-use-separate-unix-socket-for-image-pulling.patch rename to packages/by-name/kata/runtime/0012-agent-use-separate-unix-socket-for-image-pulling.patch diff --git a/packages/by-name/kata/runtime/0012-agent-use-custom-implementation-for-secure-mounting.patch b/packages/by-name/kata/runtime/0013-agent-use-custom-implementation-for-secure-mounting.patch similarity index 100% rename from packages/by-name/kata/runtime/0012-agent-use-custom-implementation-for-secure-mounting.patch rename to packages/by-name/kata/runtime/0013-agent-use-custom-implementation-for-secure-mounting.patch diff --git a/packages/by-name/kata/runtime/0013-genpolicy-don-t-apply-Nydus-workaround.patch b/packages/by-name/kata/runtime/0014-genpolicy-don-t-apply-Nydus-workaround.patch similarity index 100% rename from packages/by-name/kata/runtime/0013-genpolicy-don-t-apply-Nydus-workaround.patch rename to packages/by-name/kata/runtime/0014-genpolicy-don-t-apply-Nydus-workaround.patch diff --git a/packages/by-name/kata/runtime/0014-agent-remove-initdata-processing.patch b/packages/by-name/kata/runtime/0015-agent-remove-initdata-processing.patch similarity index 100% rename from 
packages/by-name/kata/runtime/0014-agent-remove-initdata-processing.patch rename to packages/by-name/kata/runtime/0015-agent-remove-initdata-processing.patch diff --git a/packages/by-name/kata/runtime/0015-runtime-pass-imagepuller-config-device-to-vm.patch b/packages/by-name/kata/runtime/0016-runtime-pass-imagepuller-config-device-to-vm.patch similarity index 100% rename from packages/by-name/kata/runtime/0015-runtime-pass-imagepuller-config-device-to-vm.patch rename to packages/by-name/kata/runtime/0016-runtime-pass-imagepuller-config-device-to-vm.patch diff --git a/packages/by-name/kata/runtime/0016-runtime-assign-GPU-devices-to-multiple-containers.patch b/packages/by-name/kata/runtime/0017-runtime-assign-GPU-devices-to-multiple-containers.patch similarity index 100% rename from packages/by-name/kata/runtime/0016-runtime-assign-GPU-devices-to-multiple-containers.patch rename to packages/by-name/kata/runtime/0017-runtime-assign-GPU-devices-to-multiple-containers.patch diff --git a/packages/by-name/kata/runtime/0017-runtime-remove-iommu-device.patch b/packages/by-name/kata/runtime/0018-runtime-remove-iommu-device.patch similarity index 100% rename from packages/by-name/kata/runtime/0017-runtime-remove-iommu-device.patch rename to packages/by-name/kata/runtime/0018-runtime-remove-iommu-device.patch diff --git a/packages/by-name/kata/runtime/0018-genpolicy-retry-failed-image-pulls.patch b/packages/by-name/kata/runtime/0019-genpolicy-retry-failed-image-pulls.patch similarity index 100% rename from packages/by-name/kata/runtime/0018-genpolicy-retry-failed-image-pulls.patch rename to packages/by-name/kata/runtime/0019-genpolicy-retry-failed-image-pulls.patch diff --git a/packages/by-name/kata/runtime/0019-shim-guess-CDI-devices-without-direct-match.patch b/packages/by-name/kata/runtime/0020-shim-guess-CDI-devices-without-direct-match.patch similarity index 100% rename from packages/by-name/kata/runtime/0019-shim-guess-CDI-devices-without-direct-match.patch rename to 
packages/by-name/kata/runtime/0020-shim-guess-CDI-devices-without-direct-match.patch diff --git a/packages/by-name/kata/runtime/0020-runtime-do-not-add-nr_vcpus-to-kernel-command-line.patch b/packages/by-name/kata/runtime/0021-runtime-do-not-add-nr_vcpus-to-kernel-command-line.patch similarity index 100% rename from packages/by-name/kata/runtime/0020-runtime-do-not-add-nr_vcpus-to-kernel-command-line.patch rename to packages/by-name/kata/runtime/0021-runtime-do-not-add-nr_vcpus-to-kernel-command-line.patch diff --git a/packages/by-name/kata/runtime/0021-runtime-add-SNP-ID-block-from-Pod-annotations.patch b/packages/by-name/kata/runtime/0022-runtime-add-SNP-ID-block-from-Pod-annotations.patch similarity index 100% rename from packages/by-name/kata/runtime/0021-runtime-add-SNP-ID-block-from-Pod-annotations.patch rename to packages/by-name/kata/runtime/0022-runtime-add-SNP-ID-block-from-Pod-annotations.patch diff --git a/packages/by-name/kata/runtime/0023-runtime-rs-add-SNP-ID-block-from-Pod-annotations.patch b/packages/by-name/kata/runtime/0023-runtime-rs-add-SNP-ID-block-from-Pod-annotations.patch new file mode 100644 index 00000000000..87e8fbd1988 --- /dev/null +++ b/packages/by-name/kata/runtime/0023-runtime-rs-add-SNP-ID-block-from-Pod-annotations.patch @@ -0,0 +1,231 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Paul Meyer +Date: Thu, 2 Apr 2026 12:38:29 +0200 +Subject: [PATCH] runtime-rs: add SNP ID block from Pod annotations + +Signed-off-by: Paul Meyer +--- + src/libs/kata-types/src/annotations/mod.rs | 37 ++++++++++ + .../kata-types/src/config/hypervisor/mod.rs | 26 +++++++ + .../hypervisor/src/qemu/cmdline_generator.rs | 74 ++++++++++++++++++- + 3 files changed, 135 insertions(+), 2 deletions(-) + +diff --git a/src/libs/kata-types/src/annotations/mod.rs b/src/libs/kata-types/src/annotations/mod.rs +index 249742df51b1f14bba5420becd746af2ae587a44..e0737d74840f7190151edd7f0aa4459e381c3543 100644 +--- 
a/src/libs/kata-types/src/annotations/mod.rs ++++ b/src/libs/kata-types/src/annotations/mod.rs +@@ -277,6 +277,19 @@ pub const KATA_ANNO_CFG_HYPERVISOR_MSIZE_9P: &str = "io.katacontainers.config.hy + /// The initdata annotation passed in when CVM launchs + pub const KATA_ANNO_CFG_HYPERVISOR_INIT_DATA: &str = + "io.katacontainers.config.hypervisor.cc_init_data"; ++/// Suffix added to SNP related annotations for the Milan platform. ++pub const KATA_ANNO_SNP_SUFFIX_MILAN: &str = "_Milan"; ++/// Suffix added to SNP related annotations for the Genoa platform. ++pub const KATA_ANNO_SNP_SUFFIX_GENOA: &str = "_Genoa"; ++/// A sandbox annotation to provide a custom SNP ID block for the VM. ++pub const KATA_ANNO_CFG_HYPERVISOR_SNP_ID_BLOCK: &str = ++ "io.katacontainers.config.hypervisor.snp_id_block"; ++/// A sandbox annotation to provide a custom SNP ID auth for the VM. ++pub const KATA_ANNO_CFG_HYPERVISOR_SNP_ID_AUTH: &str = ++ "io.katacontainers.config.hypervisor.snp_id_auth"; ++/// A sandbox annotation to specify the SNP guest policy for the VM. ++pub const KATA_ANNO_CFG_HYPERVISOR_SNP_GUEST_POLICY: &str = ++ "io.katacontainers.config.hypervisor.snp_guest_policy"; + + /// GPU specific annotations for remote hypervisor to help with instance selection + /// It's for minimum number of GPUs required for the VM. 
+@@ -925,6 +938,30 @@ impl Annotation { + hv.security_info.initdata = + add_hypervisor_initdata_overrides(value).unwrap(); + } ++ key if key == format!("{}{}", KATA_ANNO_CFG_HYPERVISOR_SNP_ID_BLOCK, KATA_ANNO_SNP_SUFFIX_MILAN) => { ++ hv.security_info.snp_id_block_milan = value.to_string(); ++ } ++ key if key == format!("{}{}", KATA_ANNO_CFG_HYPERVISOR_SNP_ID_BLOCK, KATA_ANNO_SNP_SUFFIX_GENOA) => { ++ hv.security_info.snp_id_block_genoa = value.to_string(); ++ } ++ key if key == format!("{}{}", KATA_ANNO_CFG_HYPERVISOR_SNP_ID_AUTH, KATA_ANNO_SNP_SUFFIX_MILAN) => { ++ hv.security_info.snp_id_auth_milan = value.to_string(); ++ } ++ key if key == format!("{}{}", KATA_ANNO_CFG_HYPERVISOR_SNP_ID_AUTH, KATA_ANNO_SNP_SUFFIX_GENOA) => { ++ hv.security_info.snp_id_auth_genoa = value.to_string(); ++ } ++ key if key == format!("{}{}", KATA_ANNO_CFG_HYPERVISOR_SNP_GUEST_POLICY, KATA_ANNO_SNP_SUFFIX_MILAN) => { ++ match value.parse::() { ++ Ok(policy) => hv.security_info.snp_guest_policy_milan = Some(policy), ++ Err(_) => return Err(u32_err), ++ } ++ } ++ key if key == format!("{}{}", KATA_ANNO_CFG_HYPERVISOR_SNP_GUEST_POLICY, KATA_ANNO_SNP_SUFFIX_GENOA) => { ++ match value.parse::() { ++ Ok(policy) => hv.security_info.snp_guest_policy_genoa = Some(policy), ++ Err(_) => return Err(u32_err), ++ } ++ } + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_GPUS => match self.get_value::(key) { + Ok(r) => { + hv.remote_info.default_gpus = r.unwrap_or_default(); +diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs +index 04f3bb2d77900b110f7239353ab806dfaa9d32a7..29b7aefb281358f9f7618e77e7db31e73221198b 100644 +--- a/src/libs/kata-types/src/config/hypervisor/mod.rs ++++ b/src/libs/kata-types/src/config/hypervisor/mod.rs +@@ -1236,6 +1236,32 @@ pub struct SecurityInfo { + #[serde(default = "default_snp_guest_policy")] + pub snp_guest_policy: u32, + ++ /// Per-CPU-model SNP 'ID Block' for Milan (96-byte, base64-encoded). 
++ #[serde(default)] ++ pub snp_id_block_milan: String, ++ ++ /// Per-CPU-model SNP 'ID Authentication Information Structure' for Milan (4096-byte, base64-encoded). ++ #[serde(default)] ++ pub snp_id_auth_milan: String, ++ ++ /// Per-CPU-model SNP Guest Policy for Milan. ++ /// If unset, falls back to `snp_guest_policy`. ++ #[serde(default)] ++ pub snp_guest_policy_milan: Option, ++ ++ /// Per-CPU-model SNP 'ID Block' for Genoa (96-byte, base64-encoded). ++ #[serde(default)] ++ pub snp_id_block_genoa: String, ++ ++ /// Per-CPU-model SNP 'ID Authentication Information Structure' for Genoa (4096-byte, base64-encoded). ++ #[serde(default)] ++ pub snp_id_auth_genoa: String, ++ ++ /// Per-CPU-model SNP Guest Policy for Genoa. ++ /// If unset, falls back to `snp_guest_policy`. ++ #[serde(default)] ++ pub snp_guest_policy_genoa: Option, ++ + /// Path to OCI hook binaries in the *guest rootfs*. + /// + /// This setting does not affect host-side hooks, which must instead be +diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +index b5bad8eb64f9fb57aff62d6aaee0fe693c90be25..28837281465bba7b89d903dda11c1eae43457fe3 100644 +--- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs ++++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +@@ -1881,6 +1881,8 @@ struct ObjectSevSnpGuest { + host_data: Option, + policy: u32, + is_snp: bool, ++ id_block: String, ++ id_auth: String, + } + + impl ObjectSevSnpGuest { +@@ -1893,6 +1895,8 @@ impl ObjectSevSnpGuest { + host_data, + policy: 0x30000, + is_snp, ++ id_block: String::new(), ++ id_auth: String::new(), + } + } + +@@ -1900,6 +1904,16 @@ impl ObjectSevSnpGuest { + self.policy = policy; + self + } ++ ++ fn set_id_block(&mut self, id_block: String) -> &mut Self { ++ self.id_block = id_block; ++ self ++ } ++ ++ fn set_id_auth(&mut self, id_auth: String) -> &mut Self { ++ self.id_auth = id_auth; ++ self ++ } + } + + 
#[async_trait] +@@ -1926,6 +1940,12 @@ impl ToQemuParams for ObjectSevSnpGuest { + if let Some(host_data) = &self.host_data { + params.push(format!("host-data={host_data}")) + } ++ if !self.id_block.is_empty() { ++ params.push(format!("id-block={}", self.id_block)); ++ } ++ if !self.id_auth.is_empty() { ++ params.push(format!("id-auth={}", self.id_auth)); ++ } + } + Ok(vec!["-object".to_owned(), params.join(",")]) + } +@@ -2185,6 +2205,29 @@ fn is_running_in_vm() -> Result { + Ok(res) + } + ++const CPU_MODEL_EPYC_MILAN: &str = "EPYC-Milan"; ++const CPU_MODEL_EPYC_GENOA: &str = "EPYC-Genoa"; ++const CPU_MODEL_EPYC_GENERIC: &str = "EPYC-v4"; ++ ++fn detect_amd_cpu_model() -> &'static str { ++ let cpuinfo = match read_to_string("/proc/cpuinfo") { ++ Ok(s) => s, ++ Err(_) => return CPU_MODEL_EPYC_GENERIC, ++ }; ++ ++ let model = cpuinfo ++ .lines() ++ .find(|line| line.starts_with("model\t")) ++ .and_then(|line| line.split(':').nth(1)) ++ .and_then(|val| val.trim().parse::().ok()); ++ ++ match model { ++ Some(0x01) => CPU_MODEL_EPYC_MILAN, ++ Some(0x11) => CPU_MODEL_EPYC_GENOA, ++ _ => CPU_MODEL_EPYC_GENERIC, ++ } ++} ++ + fn should_disable_modern() -> bool { + match is_running_in_vm() { + Ok(retval) => retval, +@@ -2583,7 +2626,34 @@ impl<'a> QemuCmdLine<'a> { + + let mut sev_snp_object = + ObjectSevSnpGuest::new(true, cbitpos, phys_addr_reduction, host_data.clone()); +- sev_snp_object.set_policy(self.config.security_info.snp_guest_policy); ++ ++ let cpu_model = detect_amd_cpu_model(); ++ ++ match cpu_model { ++ CPU_MODEL_EPYC_MILAN => { ++ if let Some(policy) = self.config.security_info.snp_guest_policy_milan { ++ sev_snp_object.set_policy(policy); ++ } else { ++ sev_snp_object.set_policy(self.config.security_info.snp_guest_policy); ++ } ++ sev_snp_object.set_id_block(self.config.security_info.snp_id_block_milan.clone()); ++ sev_snp_object.set_id_auth(self.config.security_info.snp_id_auth_milan.clone()); ++ } ++ CPU_MODEL_EPYC_GENOA => { ++ if let Some(policy) = 
self.config.security_info.snp_guest_policy_genoa { ++ sev_snp_object.set_policy(policy); ++ } else { ++ sev_snp_object.set_policy(self.config.security_info.snp_guest_policy); ++ } ++ sev_snp_object.set_id_block(self.config.security_info.snp_id_block_genoa.clone()); ++ sev_snp_object.set_id_auth(self.config.security_info.snp_id_auth_genoa.clone()); ++ } ++ _ => { ++ sev_snp_object.set_policy(self.config.security_info.snp_guest_policy); ++ sev_snp_object.set_id_block(self.config.security_info.snp_id_block.clone()); ++ sev_snp_object.set_id_auth(self.config.security_info.snp_id_auth.clone()); ++ } ++ } + + self.devices.push(Box::new(sev_snp_object)); + +@@ -2594,7 +2664,7 @@ impl<'a> QemuCmdLine<'a> { + .set_confidential_guest_support("snp") + .set_nvdimm(false); + +- self.cpu.set_type("EPYC-v4"); ++ self.cpu.set_type(cpu_model); + } + + pub fn add_tdx_protection_device( diff --git a/packages/by-name/kata/runtime/0024-runtime-rs-deny-unknown-fields-in-config.patch b/packages/by-name/kata/runtime/0024-runtime-rs-deny-unknown-fields-in-config.patch new file mode 100644 index 00000000000..9965926b31c --- /dev/null +++ b/packages/by-name/kata/runtime/0024-runtime-rs-deny-unknown-fields-in-config.patch @@ -0,0 +1,130 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Paul Meyer +Date: Mon, 30 Mar 2026 11:40:23 +0200 +Subject: [PATCH] runtime-rs: deny unknown fields in config + +Signed-off-by: Paul Meyer +--- + src/libs/kata-types/src/config/agent.rs | 2 ++ + src/libs/kata-types/src/config/hypervisor/mod.rs | 2 ++ + src/libs/kata-types/src/config/shared_mount.rs | 7 ++----- + .../kata-types/tests/texture/configuration-anno-0.toml | 4 ++-- + .../kata-types/tests/texture/configuration-anno-1.toml | 4 ++-- + 5 files changed, 10 insertions(+), 9 deletions(-) + +diff --git a/src/libs/kata-types/src/config/agent.rs b/src/libs/kata-types/src/config/agent.rs +index 07f97f45084a4b1b1db870953f84525a51ac6743..dab97f1096f1621fca9a5c98b5ff24ac4abf5aa2 
100644 +--- a/src/libs/kata-types/src/config/agent.rs ++++ b/src/libs/kata-types/src/config/agent.rs +@@ -19,6 +19,7 @@ use super::default::{ + pub const AGENT_NAME_KATA: &str = "kata"; + + #[derive(Default, Debug, Deserialize, Serialize, Clone)] ++#[serde(deny_unknown_fields)] + pub struct MemAgent { + #[serde(default, alias = "mem_agent_enable")] + pub enable: bool, +@@ -58,6 +59,7 @@ pub struct MemAgent { + + /// Kata agent configuration information. + #[derive(Debug, Deserialize, Serialize, Clone)] ++#[serde(deny_unknown_fields)] + pub struct Agent { + /// If enabled, the agent will log additional debug messages to the system log. + #[serde(default, rename = "enable_debug")] +diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs +index 29b7aefb281358f9f7618e77e7db31e73221198b..5fb3d90a1d3649b549c9706c3bead89e1dd4eb41 100644 +--- a/src/libs/kata-types/src/config/hypervisor/mod.rs ++++ b/src/libs/kata-types/src/config/hypervisor/mod.rs +@@ -1158,6 +1158,7 @@ impl NetworkInfo { + + /// Configuration information for rootless user. + #[derive(Clone, Debug, Default, Deserialize, Serialize)] ++#[serde(deny_unknown_fields)] + pub struct RootlessUser { + /// The UID of the rootless user. + #[serde(default)] +@@ -1589,6 +1590,7 @@ impl VmTemplateInfo { + + /// Configuration information for VM factory (templating, caches, etc.). + #[derive(Clone, Debug, Default, Deserialize, Serialize)] ++#[serde(deny_unknown_fields)] + pub struct Factory { + /// Enable VM templating support. + /// When enabled, new VMs may be created from a template to speed up creation. 
+diff --git a/src/libs/kata-types/src/config/shared_mount.rs b/src/libs/kata-types/src/config/shared_mount.rs +index e02342a3ecdfc59744be7b986e5921f0353b4191..d1f8a6c4829243d21975d2402aeb9594f057a483 100644 +--- a/src/libs/kata-types/src/config/shared_mount.rs ++++ b/src/libs/kata-types/src/config/shared_mount.rs +@@ -8,6 +8,7 @@ use std::io::Result; + use regex::Regex; + + #[derive(Debug, Deserialize, Serialize, Clone, Default)] ++#[serde(deny_unknown_fields)] + pub struct SharedMount { + /// Name is used to identify a pair of shared mount points. + /// This field cannot be omitted. +@@ -143,7 +144,6 @@ mod tests { + shared_mount_annotation: r#" + { + "name": "test", +- "src": "sidecar", + "src_path": "/mnt/storage", + "dst_ctr": "app", + "dst_path": "/mnt/storage" +@@ -156,7 +156,6 @@ mod tests { + { + "name": "test", + "src_ctr": "sidecar", +- "src_dir": "/mnt/storage", + "dst_ctr": "app", + "dst_path": "/mnt/storage" + }"#, +@@ -169,7 +168,6 @@ mod tests { + "name": "test", + "src_ctr": "sidecar", + "src_path": "/mnt/storage", +- "dst_container": "app", + "dst_path": "/mnt/storage" + }"#, + result: false, +@@ -181,8 +179,7 @@ mod tests { + "name": "test", + "src_ctr": "sidecar", + "src_path": "/mnt/storage", +- "dst_ctr": "app", +- "path": "/mnt/storage" ++ "dst_ctr": "app" + }"#, + result: false, + message: "shared_mount: field 'dst_path' couldn't be empty.", +diff --git a/src/libs/kata-types/tests/texture/configuration-anno-0.toml b/src/libs/kata-types/tests/texture/configuration-anno-0.toml +index a2f1dac02e8932bf2fd3c23f71d7260ad3aee2b8..5d93c36b5f16d875f5709ad0b4b7a76881a0e46c 100644 +--- a/src/libs/kata-types/tests/texture/configuration-anno-0.toml ++++ b/src/libs/kata-types/tests/texture/configuration-anno-0.toml +@@ -65,8 +65,8 @@ enable_guest_swap = true + [agent.agent0] + enable_tracing = true + debug_console_enabled = true +-debug = true +-dial_timeout = 1 ++enable_debug = true ++dial_timeout_ms = 1000 + kernel_modules = ["e1000e 
InterruptThrottleRate=3000,3000,3000 EEE=1","i915_enabled_ppgtt=0"] + container_pipe_size = 2 + [runtime] +diff --git a/src/libs/kata-types/tests/texture/configuration-anno-1.toml b/src/libs/kata-types/tests/texture/configuration-anno-1.toml +index 12a4e85f94d0883863563b9a8968cf56a4fb06f5..dce864839fdfc433b39c5703a9017576ef021553 100644 +--- a/src/libs/kata-types/tests/texture/configuration-anno-1.toml ++++ b/src/libs/kata-types/tests/texture/configuration-anno-1.toml +@@ -64,8 +64,8 @@ enable_guest_swap = true + [agent.agent0] + enable_tracing = true + debug_console_enabled = true +-debug = true +-dial_timeout = 1 ++enable_debug = true ++dial_timeout_ms = 1000 + kernel_modules = ["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1","i915_enabled_ppgtt=0"] + container_pipe_size = 2 + [runtime] diff --git a/packages/by-name/kata/runtime/0025-runtime-rs-force-virtio-blk-with-serial-name-for-ini.patch b/packages/by-name/kata/runtime/0025-runtime-rs-force-virtio-blk-with-serial-name-for-ini.patch new file mode 100644 index 00000000000..8225ef7156b --- /dev/null +++ b/packages/by-name/kata/runtime/0025-runtime-rs-force-virtio-blk-with-serial-name-for-ini.patch @@ -0,0 +1,122 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Paul Meyer +Date: Mon, 30 Mar 2026 16:14:24 +0200 +Subject: [PATCH] runtime-rs: force virtio-blk with serial name for initdata + +Signed-off-by: Paul Meyer +--- + .../hypervisor/src/qemu/cmdline_generator.rs | 14 +++++++++++--- + src/runtime-rs/crates/hypervisor/src/qemu/inner.rs | 13 ++++++++++--- + .../crates/runtimes/virt_container/src/sandbox.rs | 5 ++--- + 3 files changed, 23 insertions(+), 9 deletions(-) + +diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +index 28837281465bba7b89d903dda11c1eae43457fe3..6f310448f1ff799f1a6ee0c2468164657be333c9 100644 +--- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs ++++ 
b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +@@ -1042,6 +1042,7 @@ struct DeviceVirtioBlk { + config_wce: bool, + share_rw: bool, + devno: Option<String>, ++ serial: Option<String>, + } + + impl DeviceVirtioBlk { +@@ -1052,6 +1053,7 @@ impl DeviceVirtioBlk { + config_wce: false, + share_rw: true, + devno, ++ serial: None, + } + } + +@@ -1084,7 +1086,11 @@ impl ToQemuParams for DeviceVirtioBlk { + } else { + params.push("share-rw=off".to_owned()); + } +- params.push(format!("serial=image-{}", self.id)); ++ if let Some(serial) = &self.serial { ++ params.push(format!("serial={serial}")); ++ } else { ++ params.push(format!("serial=image-{}", self.id)); ++ } + if let Some(devno) = &self.devno { + params.push(format!("devno={devno}")); + } +@@ -2506,6 +2512,7 @@ impl<'a> QemuCmdLine<'a> { + path: &str, + is_direct: bool, + is_scsi: bool, ++ serial: Option<&str>, + ) -> Result<()> { + self.devices + .push(Box::new(BlockBackend::new(device_id, path, is_direct))); +@@ -2514,8 +2521,9 @@ impl<'a> QemuCmdLine<'a> { + self.devices + .push(Box::new(DeviceScsiHd::new(device_id, "scsi0.0", devno))); + } else { +- self.devices +- .push(Box::new(DeviceVirtioBlk::new(device_id, bus_type(), devno))); ++ let mut dev = DeviceVirtioBlk::new(device_id, bus_type(), devno); ++ dev.serial = serial.map(|s| s.to_owned()); ++ self.devices.push(Box::new(dev)); + } + + Ok(()) +diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs +index bfaf6380aa11bb8bb42156d6add0dd71d670b709..9c7b3e53a17eefad9ac2ef872a1ab185c0f38169 100644 +--- a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs ++++ b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs +@@ -138,8 +138,13 @@ impl QemuInner { + &block_dev.config.path_on_host, + block_dev.config.is_readonly, + )?, +- KATA_CCW_DEV_TYPE | KATA_BLK_DEV_TYPE | KATA_SCSI_DEV_TYPE => cmdline +- .add_block_device( ++ KATA_CCW_DEV_TYPE | KATA_BLK_DEV_TYPE | KATA_SCSI_DEV_TYPE => { ++ let serial = if
block_dev.config.path_on_host.ends_with("initdata.image") { ++ Some("initdata") ++ } else { ++ None ++ }; ++ cmdline.add_block_device( + block_dev.device_id.as_str(), + &block_dev.config.path_on_host, + block_dev +@@ -147,7 +152,9 @@ impl QemuInner { + .is_direct + .unwrap_or(self.config.blockdev_info.block_device_cache_direct), + block_dev.config.driver_option.as_str() == KATA_SCSI_DEV_TYPE, +- )?, ++ serial, ++ )? ++ } + unsupported => { + info!(sl!(), "unsupported block device driver: {}", unsupported) + } +diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +index dd222d191e328190ab64a469f608a7ce2a9af957..5f1f14628f4cbea1e35612107e2825b75ac3da98 100644 +--- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs ++++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +@@ -34,7 +34,7 @@ use hypervisor::{ + utils::{get_hvsock_path, uses_native_ccw_bus}, + HybridVsockConfig, DEFAULT_GUEST_VSOCK_CID, + }; +-use hypervisor::{BlockConfig, Hypervisor}; ++use hypervisor::{BlockConfig, Hypervisor, VIRTIO_BLOCK_PCI}; + use hypervisor::{BlockDeviceAio, PortDeviceConfig}; + use hypervisor::{ProtectionDeviceConfig, SevSnpConfig, TdxConfig}; + use kata_sys_util::hooks::HookStates; +@@ -541,11 +541,10 @@ impl VirtSandbox { + sl!(), + "initdata push data into compressed block: {:?}", &image_path + ); +- let block_driver = &hypervisor_config.blockdev_info.block_device_driver; + let block_config = BlockConfig { + path_on_host: image_path.display().to_string(), + is_readonly: true, +- driver_option: block_driver.clone(), ++ driver_option: VIRTIO_BLOCK_PCI.to_string(), + blkdev_aio: BlockDeviceAio::Native, + ..Default::default() + }; diff --git a/packages/by-name/kata/runtime/package.nix b/packages/by-name/kata/runtime/package.nix index 77af2afabde..5390685baa3 100644 --- a/packages/by-name/kata/runtime/package.nix +++ b/packages/by-name/kata/runtime/package.nix @@ -57,93 
+57,106 @@ buildGoModule (finalAttrs: { # No upstream patch available, changes first need to be discussed with Kata maintainers. # See https://katacontainers.slack.com/archives/C879ACQ00/p1731928491942299 ./0006-runtime-allow-initrd-AND-image-to-be-set.patch + ./0007-runtime-rs-allow-initrd-AND-image-to-be-set.patch # Simple genpolicy logging redaction of the policy annotation # This avoids printing the entire annotation on log level debug, which resulted in errors of the logtranslator.go # upstream didn't accept this patch: https://github.com/kata-containers/kata-containers/pull/10647 - ./0007-genpolicy-do-not-log-policy-annotation-in-debug.patch + ./0008-genpolicy-do-not-log-policy-annotation-in-debug.patch # Allow running generate with ephemeral volumes. # # This may be merged upstream through either of: # - https://github.com/kata-containers/kata-containers/pull/10947 (this patch) # - https://github.com/kata-containers/kata-containers/pull/10559 (superset including the patch) - ./0008-genpolicy-support-ephemeral-volume-source.patch + ./0009-genpolicy-support-ephemeral-volume-source.patch # Don't add storages for volumes declared in the image config. # This fixes a security issue where the host is able to write untrusted content to paths # under these volumes, by failing the policy generation if volumes without mounts are found. # Upstream issue: https://github.com/kata-containers/kata-containers/issues/11546. - ./0009-genpolicy-don-t-allow-mount-storage-for-declared-VOL.patch + ./0010-genpolicy-don-t-allow-mount-storage-for-declared-VOL.patch # Imagepulling has moved into the CDH in Kata 3.18.0. Since we are not using the CDH,we are instead starting our own Imagepuller. # This patch redirects calls by upstream's PullImage ttRPC client implementation to communicate with our imagepuller ttRPC server. 
# The patch should become unnecessary once the RFC for loose coupling of agents and guest components is implemented: # https://github.com/kata-containers/kata-containers/issues/11532 - ./0010-agent-use-custom-implementation-for-image-pulling.patch + ./0011-agent-use-custom-implementation-for-image-pulling.patch # Changes the unix socket used for ttRPC communication with the imagepuller. # Necessary to allow a separate imagestore service. # Can be removed in conjunction with patch 0018-agent-use-custom-implementation-for-image-pulling.patch. - ./0011-agent-use-separate-unix-socket-for-image-pulling.patch + ./0012-agent-use-separate-unix-socket-for-image-pulling.patch # Secure mounting is part of the CDH in Kata. Since we are not using the CDH, we are instead reimplementing it. # This patch redirects calls by upstream's SecureImageStore ttRPC client implementation to communicate with our own ttRPC server. # The patch should become unnecessary once the RFC for loose coupling of agents and guest components is implemented: # https://github.com/kata-containers/kata-containers/issues/11532 - ./0012-agent-use-custom-implementation-for-secure-mounting.patch + ./0013-agent-use-custom-implementation-for-secure-mounting.patch # Upstream expects guest pull to only use Nydus and applies workarounds that are not # necessary with force_guest_pull. This patch removes the workaround. # Upstream issue: https://github.com/kata-containers/kata-containers/issues/11757. - ./0013-genpolicy-don-t-apply-Nydus-workaround.patch + ./0014-genpolicy-don-t-apply-Nydus-workaround.patch # We're using a dedicated initdata-processor job and don't want the Kata agent to manage # initdata for us. # Upstream issue: https://github.com/kata-containers/kata-containers/issues/11532. - ./0014-agent-remove-initdata-processing.patch + ./0015-agent-remove-initdata-processing.patch # In addition to the initdata device, we also require the imagepuller's auth config # to be passed to the VM in a similar manner. 
- ./0015-runtime-pass-imagepuller-config-device-to-vm.patch + ./0016-runtime-pass-imagepuller-config-device-to-vm.patch # Privatemode requires GPU sharing between containers of the same pod. # In the hook-based flow, this worked because all devices and libs were (accidentally) handed to all containers. # With the CDI-based flow, this no longer happens. # Instead, this patch ensures that if a container has NVIDIA_VISIBLE_DEVICES=all set as an env var, # that container receives ALL Nvidia GPU devices known to the pod. - ./0016-runtime-assign-GPU-devices-to-multiple-containers.patch + ./0017-runtime-assign-GPU-devices-to-multiple-containers.patch # With recent versions of the sandbox-device-plugin, a /dev/iommu device is added # to the container spec for GPU-enabled containers. # Since the same thing is done by the CTK within the PodVM, and we only want this # to influence VM creation, we remove this device from the container spec in the agent. # Upstream bug: https://github.com/kata-containers/kata-containers/issues/12246. - ./0017-runtime-remove-iommu-device.patch + ./0018-runtime-remove-iommu-device.patch # We are observing frequent pull failures from genpolicy due to the connection being reset by the registry. # This patch allows genpolicy to retry these failed pulls multiple times. # Upstream PR: https://github.com/kata-containers/kata-containers/pull/12300. - ./0018-genpolicy-retry-failed-image-pulls.patch + ./0019-genpolicy-retry-failed-image-pulls.patch # In clusters that don't use the sandbox-device-plugin's P_GPU_ALIAS, we will not be able to # look up the device via PodResources. This patch adds additional resolution logic for that # case, relaxing the matching requirement to just the name (without vendor and class). # This is unlikely to be fixed in Kata upstream, but rather in the NVIDIA components. 
# Upstream issue: https://github.com/NVIDIA/sandbox-device-plugin/issues/46 - ./0019-shim-guess-CDI-devices-without-direct-match.patch + ./0020-shim-guess-CDI-devices-without-direct-match.patch # Kata takes a default_maxvcpus config option. Ordinarily, we could set this to 240 and do the same in the kernel commandline below. # However, kata then reduces this number to the actually available number of CPUs at runtime. # This is a problem for us because we need to know the precise kernel command line at buildtime. # TODO(charludo): attempt to make this behavior configurable upstream - ./0020-runtime-do-not-add-nr_vcpus-to-kernel-command-line.patch + ./0021-runtime-do-not-add-nr_vcpus-to-kernel-command-line.patch # Enables the Kata runtime to set the SNP ID blocks for the CPU model it is running on # based on Pod annotations. This allows us to run Pods with multiple CPUs. # This patch relies on changes made by 0001-emulate-CPU-model-that-most-closely-matches-the-host.patch # together with being specific to our use case. There are no plans to upstream it. - ./0021-runtime-add-SNP-ID-block-from-Pod-annotations.patch + ./0022-runtime-add-SNP-ID-block-from-Pod-annotations.patch + ./0023-runtime-rs-add-SNP-ID-block-from-Pod-annotations.patch + + # Deny unknown fields where possible to ease migration. This isn't possible where flatten is used. + # Upstream PR: https://github.com/kata-containers/kata-containers/pull/12756. + ./0024-runtime-rs-deny-unknown-fields-in-config.patch + + # Use virtio-blk with serial name for initdata + # Our initdata-processor expects the initdata device to be present at /dev/disks/by-label/initdata, + # which requires the device to have a stable name. Using virtio-blk with a serial number achieves this. + # TODO: check if we can improve the situation upstream or implement a fallback in the initdata-processor. + # Upstream issue: https://github.com/kata-containers/kata-containers/issues/12764. 
+ ./0025-runtime-rs-force-virtio-blk-with-serial-name-for-ini.patch ]; }; diff --git a/packages/scripts.nix b/packages/scripts.nix index c8ee022758e..3a0c10564ed 100644 --- a/packages/scripts.nix +++ b/packages/scripts.nix @@ -487,13 +487,11 @@ lib.makeScope pkgs.newScope (scripts: { ''; }; - update-kata-configurations = writeShellApplication { - name = "update-kata-configurations"; - runtimeInputs = [ - (pkgs.buildGoModule { + update-kata-configurations = + let + update-kata-configurations = pkgs.buildGoModule { inherit (contrastPkgs.contrast.contrast) vendorHash; name = "nodeinstaller-kataconfig-update-testdata"; - src = let inherit (lib) fileset path hasSuffix; @@ -510,21 +508,34 @@ lib.makeScope pkgs.newScope (scripts: { (fileset.fileFilter (file: hasSuffix ".json" file.name) (path.append root "nodeinstaller")) ]; }; - proxyVendor = true; subPackages = [ "nodeinstaller/internal/kataconfig/update-testdata" ]; - env.CGO_ENABLED = 0; ldflags = [ "-s" ]; doCheck = false; - }) - pkgs.git - ]; - text = # bash - '' - update-testdata ${contrastPkgs.kata.release-tarball} "$(git rev-parse --show-toplevel)" - ''; - }; + }; + update-kata-configurations-rs = update-kata-configurations.overrideAttrs ( + _finalAttrs: prevAttrs: { + tags = prevAttrs.tags or [ ] ++ [ "runtimers" ]; + postInstall = prevAttrs.postInstall or "" + '' + mv $out/bin/update-testdata $out/bin/update-testdata-rs + ''; + } + ); + in + writeShellApplication { + name = "update-kata-configurations"; + runtimeInputs = [ + update-kata-configurations + update-kata-configurations-rs + pkgs.git + ]; + text = # bash + '' + update-testdata ${contrastPkgs.kata.release-tarball} "$(git rev-parse --show-toplevel)" + update-testdata-rs ${contrastPkgs.kata.release-tarball} "$(git rev-parse --show-toplevel)" + ''; + }; update-kata-protos = writeShellApplication { name = "update-kata-protos";