diff --git a/internal/controller/datadogagent/controller_v2_test.go b/internal/controller/datadogagent/controller_v2_test.go
index 4941b0359d..61260bd9cf 100644
--- a/internal/controller/datadogagent/controller_v2_test.go
+++ b/internal/controller/datadogagent/controller_v2_test.go
@@ -1374,6 +1374,163 @@ func Test_AutopilotOverrides(t *testing.T) {
 	runTestCases(t, tests, runFullReconcilerTest)
 }
 
+// Test_COSProviderOverrides verifies that the GKE COS provider strips the
+// `src` HostPath volume (and its system-probe mount) that oomkill and
+// tcpqueuelength would otherwise add — the host has no /usr/src on COS nodes.
+// The provider value flows from the DDA's `datadoghq.com/provider` annotation,
+// or from the DAP's annotation propagated onto the per-profile DDAI.
+func Test_COSProviderOverrides(t *testing.T) {
+	const resourcesName, resourcesNamespace = "foo", "bar"
+	const defaultDsName = "foo-agent"
+	const profileName = "cos-profile"
+	const profileDsName = "cos-profile-agent"
+
+	defaultRequeueDuration := 15 * time.Second
+
+	cosProfile := &v1alpha1.DatadogAgentProfile{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      profileName,
+			Namespace: resourcesNamespace,
+			Annotations: map[string]string{
+				kubernetes.ProviderAnnotationKey: kubernetes.GKECosProvider,
+			},
+		},
+		Spec: v1alpha1.DatadogAgentProfileSpec{
+			ProfileAffinity: &v1alpha1.ProfileAffinity{
+				ProfileNodeAffinity: []corev1.NodeSelectorRequirement{
+					{
+						Key:      "foo",
+						Operator: corev1.NodeSelectorOpIn,
+						Values:   []string{"cos-profile"},
+					},
+				},
+			},
+			// Config is required by the DAP webhook validator. We don't need
+			// any spec changes — the COS provider is signalled via the
+			// metadata.annotations propagated to the DDAI.
+			Config: &v2alpha1.DatadogAgentSpec{},
+		},
+	}
+
+	// assertVolumes asserts the modules volume is always present (oomkill +
+	// tcpqueuelength add it unconditionally) and the src volume is present iff
+	// wantSrc is true.
+	assertVolumes := func(t *testing.T, c client.Client, ns, name string, wantSrc bool) {
+		t.Helper()
+		ds := &appsv1.DaemonSet{}
+		err := c.Get(context.TODO(), types.NamespacedName{Namespace: ns, Name: name}, ds)
+		if !assert.NoError(t, err, "Failed to get DaemonSet %s/%s", ns, name) {
+			// Without the DaemonSet there is nothing to inspect; stop here.
+			return
+		}
+
+		var sp *corev1.Container
+		for i, ctn := range ds.Spec.Template.Spec.Containers {
+			if ctn.Name == string(apicommon.SystemProbeContainerName) {
+				sp = &ds.Spec.Template.Spec.Containers[i]
+				break
+			}
+		}
+		if !assert.NotNil(t, sp, "system-probe container not found on DaemonSet %s/%s", ns, name) {
+			// assert only records the failure; return so a nil sp is never dereferenced.
+			return
+		}
+
+		hasModulesMount, hasSrcMount := false, false
+		for _, m := range sp.VolumeMounts {
+			if m.Name == common.ModulesVolumeName {
+				hasModulesMount = true
+			}
+			if m.Name == common.SrcVolumeName {
+				hasSrcMount = true
+			}
+		}
+		assert.True(t, hasModulesMount, "system-probe modules volume mount missing on %s/%s", ns, name)
+		assert.Equal(t, wantSrc, hasSrcMount, "system-probe src volume mount: want=%v got=%v on %s/%s", wantSrc, hasSrcMount, ns, name)
+
+		hasModulesVol, hasSrcVol := false, false
+		for _, v := range ds.Spec.Template.Spec.Volumes {
+			if v.Name == common.ModulesVolumeName {
+				hasModulesVol = true
+			}
+			if v.Name == common.SrcVolumeName {
+				hasSrcVol = true
+			}
+		}
+		assert.True(t, hasModulesVol, "pod-level modules volume missing on %s/%s", ns, name)
+		assert.Equal(t, wantSrc, hasSrcVol, "pod-level src volume: want=%v got=%v on %s/%s", wantSrc, hasSrcVol, ns, name)
+	}
+
+	// buildDDA returns a DDA with oomkill + tcpqueuelength enabled. A non-empty
+	// annotations map is passed to the builder via WithAnnotations.
+	buildDDA := func(annotations map[string]string) *v2alpha1.DatadogAgent {
+		b := testutils.NewInitializedDatadogAgentBuilder(resourcesNamespace, resourcesName).
+			WithOOMKillEnabled(true)
+		if len(annotations) > 0 {
+			b = b.WithAnnotations(annotations)
+		}
+		dda := b.Build()
+		dda.Spec.Features.TCPQueueLength = &v2alpha1.TCPQueueLengthFeatureConfig{
+			Enabled: ptr.To(true),
+		}
+		return dda
+	}
+
+	tests := []testCase{
+		{
+			name: "[cos] baseline DDA no annotation: src volume present on default DS",
+			loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
+				dda := buildDDA(nil)
+				_ = c.Create(context.TODO(), dda)
+				return dda
+			},
+			want:    reconcile.Result{RequeueAfter: defaultRequeueDuration},
+			wantErr: false,
+			wantFunc: func(t *testing.T, c client.Client) {
+				assertVolumes(t, c, resourcesNamespace, defaultDsName, true)
+			},
+		},
+		{
+			name: "[cos] DDA with gke-cos annotation strips src volume on default DS",
+			loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
+				dda := buildDDA(map[string]string{
+					kubernetes.ProviderAnnotationKey: kubernetes.GKECosProvider,
+				})
+				_ = c.Create(context.TODO(), dda)
+				return dda
+			},
+			want:    reconcile.Result{RequeueAfter: defaultRequeueDuration},
+			wantErr: false,
+			wantFunc: func(t *testing.T, c client.Client) {
+				assertVolumes(t, c, resourcesNamespace, defaultDsName, false)
+			},
+		},
+		{
+			name: "[cos] DDA without annotation, DAP with gke-cos strips src on profile DS only",
+			clientBuilder: fake.NewClientBuilder().
+				WithStatusSubresource(&v2alpha1.DatadogAgent{}, &v1alpha1.DatadogAgentProfile{}, &v1alpha1.DatadogAgentInternal{}).
+				WithObjects(cosProfile),
+			loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
+				dda := buildDDA(nil)
+				_ = c.Create(context.TODO(), dda)
+				return dda
+			},
+			profile:         cosProfile,
+			profilesEnabled: true,
+			want:            reconcile.Result{RequeueAfter: defaultRequeueDuration},
+			wantErr:         false,
+			wantFunc: func(t *testing.T, c client.Client) {
+				// Profile DDAI inherited the DAP's COS annotation → src stripped.
+				assertVolumes(t, c, resourcesNamespace, profileDsName, false)
+				// Default DDAI has no provider annotation → src present.
+				assertVolumes(t, c, resourcesNamespace, defaultDsName, true)
+			},
+		},
+	}
+
+	runTestCases(t, tests, runFullReconcilerTest)
+}
+
 func verifyDaemonsetContainers(t *testing.T, c client.Client, resourcesNamespace, dsName string, expectedContainers []string) {
 	ds := &appsv1.DaemonSet{}
 	err := c.Get(context.TODO(), types.NamespacedName{Namespace: resourcesNamespace, Name: dsName}, ds)
diff --git a/internal/controller/datadogagent/feature/oomkill/feature.go b/internal/controller/datadogagent/feature/oomkill/feature.go
index 7fe0e453f2..e4a34d5954 100644
--- a/internal/controller/datadogagent/feature/oomkill/feature.go
+++ b/internal/controller/datadogagent/feature/oomkill/feature.go
@@ -17,6 +17,8 @@ import (
 	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent"
 	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
 	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume"
+	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/providercaps"
+	"github.com/DataDog/datadog-operator/pkg/kubernetes"
 )
 
 func init() {
@@ -34,6 +36,17 @@ func buildOOMKillFeature(options *feature.Options) feature.Feature {
 
 type oomKillFeature struct{}
 
+// NodeAgentProviderCapabilities returns provider-conditional pod-template
+// mutations for the node agent. On GKE COS, /usr/src does not exist on host
+// nodes; strip the src volume + mounts so the pod schedules successfully.
+func (f *oomKillFeature) NodeAgentProviderCapabilities() providercaps.NodeAgentProviderCapabilities {
+	return providercaps.NodeAgentProviderCapabilities{
+		kubernetes.GKECosProvider: {
+			RemoveVolumes: []string{common.SrcVolumeName},
+		},
+	}
+}
+
 // ID returns the ID of the Feature
 func (f *oomKillFeature) ID() feature.IDType {
 	return feature.OOMKillIDType
@@ -81,7 +94,8 @@ func (f *oomKillFeature) ManageNodeAgent(managers feature.PodTemplateManagers) e
 	managers.VolumeMount().AddVolumeMountToContainer(&modulesVolMount, apicommon.SystemProbeContainerName)
 	managers.Volume().AddVolume(&modulesVol)
 
-	// src volume mount
+	// src volume mount — stripped on GKE COS by NodeAgentProviderCapabilities
+	// (host nodes have no /usr/src).
 	srcVol, srcVolMount := volume.GetVolumes(common.SrcVolumeName, common.SrcVolumePath, common.SrcVolumePath, true)
 	managers.VolumeMount().AddVolumeMountToContainer(&srcVolMount, apicommon.SystemProbeContainerName)
 	managers.Volume().AddVolume(&srcVol)
diff --git a/internal/controller/datadogagent/feature/tcpqueuelength/feature.go b/internal/controller/datadogagent/feature/tcpqueuelength/feature.go
index f652683bb2..71bf901bc8 100644
--- a/internal/controller/datadogagent/feature/tcpqueuelength/feature.go
+++ b/internal/controller/datadogagent/feature/tcpqueuelength/feature.go
@@ -17,6 +17,8 @@ import (
 	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent"
 	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
 	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume"
+	"github.com/DataDog/datadog-operator/internal/controller/datadogagent/providercaps"
+	"github.com/DataDog/datadog-operator/pkg/kubernetes"
 )
 
 func init() {
@@ -34,6 +36,17 @@ func buildTCPQueueLengthFeature(options *feature.Options) feature.Feature {
 
 type tcpQueueLengthFeature struct{}
 
+// NodeAgentProviderCapabilities returns provider-conditional pod-template
+// mutations for the node agent. For GKE COS it lists the src volume under
+// RemoveVolumes so the volume + mounts are stripped: COS hosts have no /usr/src.
+func (f *tcpQueueLengthFeature) NodeAgentProviderCapabilities() providercaps.NodeAgentProviderCapabilities {
+	return providercaps.NodeAgentProviderCapabilities{
+		kubernetes.GKECosProvider: {
+			RemoveVolumes: []string{common.SrcVolumeName},
+		},
+	}
+}
+
 // ID returns the ID of the Feature
 func (f *tcpQueueLengthFeature) ID() feature.IDType {
 	return feature.TCPQueueLengthIDType
@@ -84,7 +97,8 @@ func (f *tcpQueueLengthFeature) ManageNodeAgent(managers feature.PodTemplateMana
 	managers.VolumeMount().AddVolumeMountToContainer(&modulesVolMount, apicommon.SystemProbeContainerName)
 	managers.Volume().AddVolume(&modulesVol)
 
-	// src volume mount
+	// src volume mount — stripped on GKE COS by NodeAgentProviderCapabilities
+	// (host nodes have no /usr/src).
 	srcVol, srcVolMount := volume.GetVolumes(common.SrcVolumeName, common.SrcVolumePath, common.SrcVolumePath, true)
 	managers.VolumeMount().AddVolumeMountToContainer(&srcVolMount, apicommon.SystemProbeContainerName)
 	managers.Volume().AddVolume(&srcVol)
diff --git a/internal/controller/datadogagent/profile.go b/internal/controller/datadogagent/profile.go
index decda4abac..410e7fd38a 100644
--- a/internal/controller/datadogagent/profile.go
+++ b/internal/controller/datadogagent/profile.go
@@ -25,6 +25,7 @@ import (
 	"github.com/DataDog/datadog-operator/pkg/agentprofile"
 	"github.com/DataDog/datadog-operator/pkg/constants"
 	"github.com/DataDog/datadog-operator/pkg/controller/utils/comparison"
+	"github.com/DataDog/datadog-operator/pkg/kubernetes"
 )
 
 func sendProfileEnabledMetric(enabled bool) {
@@ -263,6 +264,16 @@ func setProfileDDAIMeta(ddai *v1alpha1.DatadogAgentInternal, profile *v1alpha1.D
 		}
 		ddai.Labels[constants.ProfileLabelKey] = profile.Name
 	}
+	// Propagate the provider annotation from the profile onto the DDAI so a
+	// DAP can declare a provider that differs from the DDA (e.g. a GKE COS
+	// node pool selected by the profile). Whenever the key is present on the
+	// profile, its value (even an empty one) replaces any value already on the DDAI.
+	if v, ok := profile.GetAnnotations()[kubernetes.ProviderAnnotationKey]; ok {
+		if ddai.Annotations == nil {
+			ddai.Annotations = make(map[string]string)
+		}
+		ddai.Annotations[kubernetes.ProviderAnnotationKey] = v
+	}
 	return nil
 }
 
diff --git a/pkg/kubernetes/provider.go b/pkg/kubernetes/provider.go
index daad33cf28..39073f3e85 100644
--- a/pkg/kubernetes/provider.go
+++ b/pkg/kubernetes/provider.go
@@ -32,6 +32,11 @@ const (
 	// GKECloudProvider GKE CloudProvider name
 	GKECloudProvider = "gke"
 
+	// GKECosProvider is the full provider string for GKE on Container-Optimized OS
+	// nodes (matches the `{cloudProvider}-{value}` convention from
+	// generateValidProviderName). Used as a NodeAgentProviderCapabilities map key.
+	GKECosProvider = "gke-cos"
+
 	// GKEProviderLabel is the GKE node label used to determine the node's provider
 	GKEProviderLabel = "cloud.google.com/gke-os-distribution"
 