diff --git a/pkg/operator/controllers/guardrails/config/config.go b/pkg/operator/controllers/guardrails/config/config.go index 794c81649c7..efb6cad6f9c 100644 --- a/pkg/operator/controllers/guardrails/config/config.go +++ b/pkg/operator/controllers/guardrails/config/config.go @@ -25,3 +25,7 @@ type GuardRailsDeploymentConfig struct { type GuardRailsPolicyConfig struct { Enforcement string } + +type GuardRailsVAPBindingConfig struct { + ValidationAction string +} diff --git a/pkg/operator/controllers/guardrails/guardrails_config.go b/pkg/operator/controllers/guardrails/guardrails_config.go index 73f420dcba3..073213ab445 100644 --- a/pkg/operator/controllers/guardrails/guardrails_config.go +++ b/pkg/operator/controllers/guardrails/guardrails_config.go @@ -67,6 +67,9 @@ const ( gkDeploymentPath = "staticresources" gkTemplatePath = "policies/gktemplates" gkConstraintsPath = "policies/gkconstraints" + + vapPolicyPath = "policies-vap/vap" + vapBindingPath = "policies-vap/vap-binding" ) //go:embed staticresources @@ -78,6 +81,12 @@ var gkPolicyTemplates embed.FS //go:embed policies/gkconstraints var gkPolicyConstraints embed.FS +//go:embed policies-vap/vap +var vapPolicies embed.FS + +//go:embed policies-vap/vap-binding +var vapBindings embed.FS + func (r *Reconciler) getDefaultDeployConfig(ctx context.Context, instance *arov1alpha1.Cluster) *config.GuardRailsDeploymentConfig { // apply the default value if the flag is empty or missing deployConfig := &config.GuardRailsDeploymentConfig{ @@ -149,6 +158,21 @@ func (r *Reconciler) VersionLT411(ctx context.Context) (bool, error) { return clusterVersion.Lt(ver411), nil } +func (r *Reconciler) VersionLT417(ctx context.Context) (bool, error) { + cv := &configv1.ClusterVersion{} + err := r.client.Get(ctx, types.NamespacedName{Name: "version"}, cv) + if err != nil { + return false, err + } + clusterVersion, err := version.GetClusterVersion(cv) + if err != nil { + r.log.Errorf("error getting the OpenShift version: %v", err) + return false, err + } + ver417, _ := version.ParseVersion("4.17.0") + return clusterVersion.Lt(ver417), nil +} + func (r *Reconciler) getGatekeeperDeployedNs(ctx context.Context, instance *arov1alpha1.Cluster) (string, error) { name := "" if r.kubernetescli == nil { diff --git a/pkg/operator/controllers/guardrails/guardrails_controller.go b/pkg/operator/controllers/guardrails/guardrails_controller.go index 651d236c7a1..7420ef37a96 100644 --- a/pkg/operator/controllers/guardrails/guardrails_controller.go +++ b/pkg/operator/controllers/guardrails/guardrails_controller.go @@ -39,7 +39,8 @@ type Reconciler struct { readinessTimeout time.Duration dh dynamichelper.Interface namespace string - policyTickerDone chan bool + gkTickerDone chan bool + vapTickerDone chan bool reconciliationMinutes int cleanupNeeded bool kubernetescli kubernetes.Interface @@ -64,12 +65,10 @@ func NewReconciler(log *logrus.Entry, client client.Client, dh dynamichelper.Int func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) { instance := &arov1alpha1.Cluster{} - err := r.client.Get(ctx, types.NamespacedName{Name: arov1alpha1.SingletonClusterName}, instance) - if err != nil { + if err := r.client.Get(ctx, types.NamespacedName{Name: arov1alpha1.SingletonClusterName}, instance); err != nil { return reconcile.Result{}, err } - // how to handle the enable/disable sequence of enabled and managed? if !instance.Spec.OperatorFlags.GetSimpleBoolean(operator.GuardrailsEnabled) { r.log.Debug("controller is disabled") return reconcile.Result{}, nil @@ -79,95 +78,130 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. managed := instance.Spec.OperatorFlags.GetWithDefault(operator.GuardrailsDeployManaged, "") - // If enabled and managed=true, install GuardRails - // If enabled and managed=false, remove the GuardRails deployment - // If enabled and managed is missing, do nothing - if strings.EqualFold(managed, "true") { - if ns, err := r.getGatekeeperDeployedNs(ctx, instance); err == nil && ns != "" { - r.log.Warnf("Found another GateKeeper deployed in ns %s, aborting Guardrails", ns) - return reconcile.Result{}, nil - } + lt417, err := r.VersionLT417(ctx) + if err != nil { + return reconcile.Result{}, err + } + + // managed=false: clean up whatever policy mechanism this version uses + if strings.EqualFold(managed, "false") { + return r.cleanupManaged(ctx, instance, lt417) + } + + // managed is blank/missing: no action + if !strings.EqualFold(managed, "true") { + r.log.Warnf("unrecognised managed flag (%s), doing nothing", managed) + return reconcile.Result{}, nil + } + + // Pre-4.17 clusters use the Gatekeeper / Rego workflow + if lt417 { + return r.deployGatekeeper(ctx, instance) + } - // Deploy the GateKeeper manifests and config - deployConfig := r.getDefaultDeployConfig(ctx, instance) - err = r.deployer.CreateOrUpdate(ctx, instance, deployConfig) - if err != nil { + // v4.17+ — migrate away from Gatekeeper if it is still running + if r.gatekeeperCleanupNeeded(ctx, instance) { + if err := r.cleanupGatekeeper(ctx, instance); err != nil { return reconcile.Result{}, err } + } - // Check Gatekeeper has become ready, wait up to readinessTimeout (default 5min) - timeoutCtx, cancel := context.WithTimeout(ctx, r.readinessTimeout) - defer cancel() + // Deploy VAP policies according to per-policy feature flags + if err := r.deployVAP(ctx, instance); err != nil { + return reconcile.Result{}, err + } - err := wait.PollImmediateUntil(r.readinessPollTime, func() (bool, error) { - return r.gatekeeperDeploymentIsReady(ctx, deployConfig) - }, timeoutCtx.Done()) - if err != nil { - return reconcile.Result{}, fmt.Errorf("GateKeeper deployment timed out on Ready: %w", err) - } - r.cleanupNeeded = true + r.startVAPTicker(ctx, instance) + + return reconcile.Result{}, nil +} + +// deployGatekeeper handles the managed=true path for clusters < v4.17. +func (r *Reconciler) deployGatekeeper(ctx context.Context, instance *arov1alpha1.Cluster) (ctrl.Result, error) { + if ns, err := r.getGatekeeperDeployedNs(ctx, instance); err == nil && ns != "" { + r.log.Warnf("Found another GateKeeper deployed in ns %s, aborting Guardrails", ns) + return reconcile.Result{}, nil + } + + deployConfig := r.getDefaultDeployConfig(ctx, instance) + if err := r.deployer.CreateOrUpdate(ctx, instance, deployConfig); err != nil { + return reconcile.Result{}, err + } + + timeoutCtx, cancel := context.WithTimeout(ctx, r.readinessTimeout) + defer cancel() + + if err := wait.PollImmediateUntil(r.readinessPollTime, func() (bool, error) { + return r.gatekeeperDeploymentIsReady(ctx, deployConfig) + }, timeoutCtx.Done()); err != nil { + return reconcile.Result{}, fmt.Errorf("GateKeeper deployment timed out on Ready: %w", err) + } + + r.cleanupNeeded = true + + if r.gkPolicyTemplate != nil { policyConfig := &config.GuardRailsPolicyConfig{} - if r.gkPolicyTemplate != nil { - // Deploy the GateKeeper ConstraintTemplate - err = r.gkPolicyTemplate.CreateOrUpdate(ctx, instance, policyConfig) - if err != nil { - return reconcile.Result{}, err - } - - err := wait.PollImmediateUntil(r.readinessPollTime, func() (bool, error) { - return r.gkPolicyTemplate.IsConstraintTemplateReady(ctx, policyConfig) - }, timeoutCtx.Done()) - if err != nil { - return reconcile.Result{}, fmt.Errorf("GateKeeper ConstraintTemplates timed out on creation: %w", err) - } - - // Deploy the GateKeeper Constraint - err = r.ensurePolicy(ctx, gkPolicyConstraints, gkConstraintsPath) - if err != nil { - return reconcile.Result{}, err - } + + if err := r.gkPolicyTemplate.CreateOrUpdate(ctx, instance, policyConfig); err != nil { + return reconcile.Result{}, err } - // start a ticker to re-enforce gatekeeper policies periodically - r.startTicker(ctx, instance) - } else if strings.EqualFold(managed, "false") { - if !r.cleanupNeeded { - if ns, err := r.getGatekeeperDeployedNs(ctx, instance); err == nil && ns != "" { - // resources were *not* created by guardrails, plus another gatekeeper deployed - // - // guardrails didn't get deployed most likely due to another gatekeeper is deployed by customer - // this is to avoid the accidental deletion of gatekeeper CRDs that were deployed by customer - // the unnamespaced gatekeeper CRDs were possibly created by a customised gatekeeper, hence cannot ramdomly delete them. - r.log.Warn("Skipping cleanup as it is not safe and may destroy customer's gatekeeper resources") - return reconcile.Result{}, nil - } + if err := wait.PollImmediateUntil(r.readinessPollTime, func() (bool, error) { + return r.gkPolicyTemplate.IsConstraintTemplateReady(ctx, policyConfig) + }, timeoutCtx.Done()); err != nil { + return reconcile.Result{}, fmt.Errorf("GateKeeper ConstraintTemplates timed out on creation: %w", err) } - if r.gkPolicyTemplate != nil { - // stop the gatekeeper policies re-enforce ticker - r.stopTicker() + if err := r.ensurePolicy(ctx, gkPolicyConstraints, gkConstraintsPath); err != nil { + return reconcile.Result{}, err + } + } - err = r.removePolicy(ctx, gkPolicyConstraints, gkConstraintsPath) - if err != nil { - r.log.Warnf("failed to remove Constraints with error %s", err.Error()) - } + r.startGKTicker(ctx, instance) + return reconcile.Result{}, nil +} - err = r.gkPolicyTemplate.Remove(ctx, config.GuardRailsPolicyConfig{}) - if err != nil { - r.log.Warnf("failed to remove ConstraintTemplates with error %s", err.Error()) - } - } - err = r.deployer.Remove(ctx, config.GuardRailsDeploymentConfig{Namespace: r.namespace}) - if err != nil { - r.log.Warnf("failed to remove deployer with error %s", err.Error()) +// cleanupManaged handles the managed=false path. The resources to remove +// depend on the cluster version: pre-4.17 uses Gatekeeper, 4.17+ uses VAP +// (and may also need leftover Gatekeeper resources removed). +func (r *Reconciler) cleanupManaged(ctx context.Context, instance *arov1alpha1.Cluster, lt417 bool) (ctrl.Result, error) { + if lt417 { + return r.cleanupGatekeeperManaged(ctx, instance) + } + + // v4.17+: remove VAP policies + r.stopVAPTicker() + if err := r.removeAllVAP(ctx); err != nil { + r.log.Warnf("failed to remove VAP policies: %s", err.Error()) + } + + // also clean up Gatekeeper if it is still present (upgrade scenario) + if r.gatekeeperCleanupNeeded(ctx, instance) { + if err := r.cleanupGatekeeper(ctx, instance); err != nil { return reconcile.Result{}, err } - r.cleanupNeeded = false } return reconcile.Result{}, nil } +// cleanupGatekeeperManaged is the managed=false cleanup for pre-4.17 clusters. +// It preserves the safety check that avoids destroying a customer-deployed +// Gatekeeper in a different namespace, then delegates to cleanupGatekeeper. +func (r *Reconciler) cleanupGatekeeperManaged(ctx context.Context, instance *arov1alpha1.Cluster) (ctrl.Result, error) { + if !r.cleanupNeeded { + if ns, err := r.getGatekeeperDeployedNs(ctx, instance); err == nil && ns != "" { + r.log.Warn("Skipping cleanup as it is not safe and may destroy customer's gatekeeper resources") + return reconcile.Result{}, nil + } + } + + if err := r.cleanupGatekeeper(ctx, instance); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil +} + // SetupWithManager setup our manager func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { grBuilder := ctrl.NewControllerManagedBy(mgr). diff --git a/pkg/operator/controllers/guardrails/guardrails_controller_test.go b/pkg/operator/controllers/guardrails/guardrails_controller_test.go index ad80a27b4b5..60ce8c30caf 100644 --- a/pkg/operator/controllers/guardrails/guardrails_controller_test.go +++ b/pkg/operator/controllers/guardrails/guardrails_controller_test.go @@ -17,20 +17,37 @@ import ( ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/reconcile" + configv1 "github.com/openshift/api/config/v1" + "github.com/Azure/ARO-RP/pkg/operator" arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" "github.com/Azure/ARO-RP/pkg/operator/controllers/guardrails/config" mock_deployer "github.com/Azure/ARO-RP/pkg/util/mocks/deployer" + mock_dynamichelper "github.com/Azure/ARO-RP/pkg/util/mocks/dynamichelper" + _ "github.com/Azure/ARO-RP/pkg/util/scheme" ) -func TestGuardRailsReconciler(t *testing.T) { +func clusterVersionForTest(version string) *configv1.ClusterVersion { + return &configv1.ClusterVersion{ + ObjectMeta: metav1.ObjectMeta{Name: "version"}, + Status: configv1.ClusterVersionStatus{ + History: []configv1.UpdateHistory{ + {State: configv1.CompletedUpdate, Version: version}, + }, + }, + } +} + +func TestGuardRailsReconcilerGatekeeper(t *testing.T) { + cv := clusterVersionForTest("4.16.0") + tests := []struct { name string mocks func(*mock_deployer.MockDeployer, *arov1alpha1.Cluster) + dhMocks func(*mock_dynamichelper.MockInterface) flags arov1alpha1.OperatorFlags cleanupNeeded bool - // errors - wantErr string + wantErr string }{ { name: "disabled", @@ -161,6 +178,9 @@ func TestGuardRailsReconciler(t *testing.T) { mocks: func(md *mock_deployer.MockDeployer, cluster *arov1alpha1.Cluster) { md.EXPECT().Remove(gomock.Any(), gomock.Any()).Return(nil) }, + dhMocks: func(dh *mock_dynamichelper.MockInterface) { + dh.EXPECT().EnsureDeletedGVR(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + }, }, { name: "managed=false (removal), Remove() fails", @@ -173,7 +193,10 @@ func TestGuardRailsReconciler(t *testing.T) { mocks: func(md *mock_deployer.MockDeployer, cluster *arov1alpha1.Cluster) { md.EXPECT().Remove(gomock.Any(), gomock.Any()).Return(errors.New("failed delete")) }, - wantErr: "failed delete", + dhMocks: func(dh *mock_dynamichelper.MockInterface) { + dh.EXPECT().EnsureDeletedGVR(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + }, + wantErr: "failed to remove Gatekeeper deployment: failed delete", }, { name: "managed=blank (no action)", @@ -202,16 +225,21 @@ func TestGuardRailsReconciler(t *testing.T) { ACRDomain: "acrtest.example.com", }, } - deployer := mock_deployer.NewMockDeployer(controller) - clientBuilder := ctrlfake.NewClientBuilder().WithObjects(cluster) + dep := mock_deployer.NewMockDeployer(controller) + dh := mock_dynamichelper.NewMockInterface(controller) + clientBuilder := ctrlfake.NewClientBuilder().WithObjects(cluster, cv) if tt.mocks != nil { - tt.mocks(deployer, cluster) + tt.mocks(dep, cluster) + } + if tt.dhMocks != nil { + tt.dhMocks(dh) } r := &Reconciler{ log: logrus.NewEntry(logrus.StandardLogger()), - deployer: deployer, + deployer: dep, + dh: dh, client: clientBuilder.Build(), readinessTimeout: 0 * time.Second, readinessPollTime: 1 * time.Second, @@ -228,3 +256,150 @@ func TestGuardRailsReconciler(t *testing.T) { }) } } + +func TestReconcileVAP(t *testing.T) { + cv := clusterVersionForTest("4.17.0") + + tests := []struct { + name string + flags arov1alpha1.OperatorFlags + depMocks func(*mock_deployer.MockDeployer) + dhMocks func(*mock_dynamichelper.MockInterface) + cleanupNeeded bool + wantErr string + }{ + { + name: "VAP: disabled", + flags: arov1alpha1.OperatorFlags{ + operator.GuardrailsEnabled: operator.FlagFalse, + operator.GuardrailsDeployManaged: operator.FlagTrue, + }, + }, + { + name: "VAP: managed=true, deploys VAP policies", + flags: arov1alpha1.OperatorFlags{ + operator.GuardrailsEnabled: operator.FlagTrue, + operator.GuardrailsDeployManaged: operator.FlagTrue, + operator.GuardrailsPolicyMachineDenyManaged: operator.FlagTrue, + operator.GuardrailsPolicyMachineDenyEnforcement: operator.GuardrailsPolicyDeny, + operator.GuardrailsPolicyMachineConfigDenyManaged: operator.FlagTrue, + operator.GuardrailsPolicyMachineConfigDenyEnforcement: operator.GuardrailsPolicyDryrun, + operator.GuardrailsPolicyPrivNamespaceDenyManaged: operator.FlagTrue, + operator.GuardrailsPolicyPrivNamespaceDenyEnforcement: operator.GuardrailsPolicyWarn, + }, + dhMocks: func(dh *mock_dynamichelper.MockInterface) { + // 3 policies + 3 bindings = 6 Ensure calls (server-side apply) + dh.EXPECT().Ensure(gomock.Any(), gomock.Any()).Return(nil).Times(6) + }, + }, + { + name: "VAP: managed=true with gatekeeper migration (upgrade from pre-4.17)", + flags: arov1alpha1.OperatorFlags{ + operator.GuardrailsEnabled: operator.FlagTrue, + operator.GuardrailsDeployManaged: operator.FlagTrue, + operator.GuardrailsPolicyMachineDenyManaged: operator.FlagTrue, + operator.GuardrailsPolicyMachineDenyEnforcement: operator.GuardrailsPolicyDeny, + operator.GuardrailsPolicyMachineConfigDenyManaged: operator.FlagTrue, + operator.GuardrailsPolicyMachineConfigDenyEnforcement: operator.GuardrailsPolicyDryrun, + operator.GuardrailsPolicyPrivNamespaceDenyManaged: operator.FlagTrue, + operator.GuardrailsPolicyPrivNamespaceDenyEnforcement: operator.GuardrailsPolicyWarn, + }, + cleanupNeeded: true, + depMocks: func(md *mock_deployer.MockDeployer) { + md.EXPECT().Remove(gomock.Any(), gomock.Any()).Return(nil) + }, + dhMocks: func(dh *mock_dynamichelper.MockInterface) { + // cleanupGatekeeper: removePolicy calls EnsureDeletedGVR for GK constraints + // then deployVAP: 3 Ensure (policy) + 3 EnsureDeletedGVR (binding delete) + 3 Ensure (binding recreate) + dh.EXPECT().EnsureDeletedGVR(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + dh.EXPECT().Ensure(gomock.Any(), gomock.Any()).Return(nil).Times(6) + }, + }, + { + name: "VAP: managed=false, removes all policies", + flags: arov1alpha1.OperatorFlags{ + operator.GuardrailsEnabled: operator.FlagTrue, + operator.GuardrailsDeployManaged: operator.FlagFalse, + }, + dhMocks: func(dh *mock_dynamichelper.MockInterface) { + // 3 policies × (binding delete + policy delete) = 6 deletions + dh.EXPECT().EnsureDeletedGVR(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).Times(6) + }, + }, + { + name: "VAP: managed=blank, no action", + flags: arov1alpha1.OperatorFlags{ + operator.GuardrailsEnabled: operator.FlagTrue, + operator.GuardrailsDeployManaged: "", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + controller := gomock.NewController(t) + defer controller.Finish() + + cluster := &arov1alpha1.Cluster{ + TypeMeta: metav1.TypeMeta{ + Kind: "Cluster", + APIVersion: "aro.openshift.io/v1alpha1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: arov1alpha1.SingletonClusterName, + }, + Spec: arov1alpha1.ClusterSpec{ + OperatorFlags: tt.flags, + }, + } + + dep := mock_deployer.NewMockDeployer(controller) + dh := mock_dynamichelper.NewMockInterface(controller) + if tt.depMocks != nil { + tt.depMocks(dep) + } + if tt.dhMocks != nil { + tt.dhMocks(dh) + } + + r := &Reconciler{ + log: logrus.NewEntry(logrus.StandardLogger()), + deployer: dep, + client: ctrlfake.NewClientBuilder().WithObjects(cluster, cv).Build(), + dh: dh, + cleanupNeeded: tt.cleanupNeeded, + } + _, err := r.Reconcile(context.Background(), reconcile.Request{}) + if err != nil && err.Error() != tt.wantErr { + t.Errorf("got error '%v', wanted error '%v'", err, tt.wantErr) + } + + if err == nil && tt.wantErr != "" { + t.Errorf("did not get an error, but wanted error '%v'", tt.wantErr) + } + }) + } +} + +func TestVapValidationAction(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"deny", "Deny"}, + {"Deny", "Deny"}, + {"warn", "Warn"}, + {"Warn", "Warn"}, + {"dryrun", "Audit"}, + {"DryRun", "Audit"}, + {"", "Warn"}, + {"unknown", "Warn"}, + } + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + got := vapValidationAction(tt.input) + if got != tt.want { + t.Errorf("vapValidationAction(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} diff --git a/pkg/operator/controllers/guardrails/guardrails_policy.go b/pkg/operator/controllers/guardrails/guardrails_policy.go index 35ef4455339..83ef3a317ad 100644 --- a/pkg/operator/controllers/guardrails/guardrails_policy.go +++ b/pkg/operator/controllers/guardrails/guardrails_policy.go @@ -16,6 +16,7 @@ import ( "time" kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kruntime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -114,8 +115,50 @@ func (r *Reconciler) removePolicy(ctx context.Context, fs embed.FS, path string) return nil } -func (r *Reconciler) policyTicker(ctx context.Context, instance *arov1alpha1.Cluster) { - r.policyTickerDone = make(chan bool) +// gatekeeperCleanupNeeded returns true if there are Gatekeeper resources on the +// cluster that should be removed as part of the migration to VAP. This covers +// the upgrade scenario where Gatekeeper was deployed on a pre-4.17 cluster. +func (r *Reconciler) gatekeeperCleanupNeeded(ctx context.Context, instance *arov1alpha1.Cluster) bool { + if r.cleanupNeeded { + return true + } + if r.kubernetescli == nil { + return false + } + ns := instance.Spec.OperatorFlags.GetWithDefault(controllerNamespace, defaultNamespace) + _, err := r.kubernetescli.AppsV1().Deployments(ns).Get(ctx, "gatekeeper-audit", metav1.GetOptions{}) + return err == nil +} + +// cleanupGatekeeper removes all Gatekeeper resources: constraints, constraint +// templates, and the deployment itself. It is safe to call when no Gatekeeper +// resources exist. +func (r *Reconciler) cleanupGatekeeper(ctx context.Context, instance *arov1alpha1.Cluster) error { + r.log.Info("cleaning up Gatekeeper resources after upgrade to v4.17+") + + r.stopGKTicker() + + if err := r.removePolicy(ctx, gkPolicyConstraints, gkConstraintsPath); err != nil { + r.log.Warnf("failed to remove Gatekeeper constraints: %s", err.Error()) + } + + if r.gkPolicyTemplate != nil { + if err := r.gkPolicyTemplate.Remove(ctx, config.GuardRailsPolicyConfig{}); err != nil { + r.log.Warnf("failed to remove Gatekeeper ConstraintTemplates: %s", err.Error()) + } + } + + ns := instance.Spec.OperatorFlags.GetWithDefault(controllerNamespace, defaultNamespace) + if err := r.deployer.Remove(ctx, config.GuardRailsDeploymentConfig{Namespace: ns}); err != nil { + return fmt.Errorf("failed to remove Gatekeeper deployment: %w", err) + } + + r.cleanupNeeded = false + return nil +} + +func (r *Reconciler) gkTicker(ctx context.Context, instance *arov1alpha1.Cluster) { + r.gkTickerDone = make(chan bool) var err error minutes := instance.Spec.OperatorFlags.GetWithDefault(controllerReconciliationMinutes, defaultReconciliationMinutes) @@ -128,44 +171,44 @@ func (r *Reconciler) policyTicker(ctx context.Context, instance *arov1alpha1.Clu defer ticker.Stop() for { select { - case done := <-r.policyTickerDone: + case done := <-r.gkTickerDone: if done { - r.policyTickerDone = nil + r.gkTickerDone = nil return } // false to trigger a ticker reset - r.log.Infof("policyTicker reset to %d min", r.reconciliationMinutes) + r.log.Infof("gkTicker reset to %d min", r.reconciliationMinutes) ticker.Reset(time.Duration(r.reconciliationMinutes) * time.Minute) case <-ticker.C: err = r.ensurePolicy(ctx, gkPolicyConstraints, gkConstraintsPath) if err != nil { - r.log.Errorf("policyTicker ensurePolicy error %s", err.Error()) + r.log.Errorf("gkTicker ensurePolicy error %s", err.Error()) } } } } -func (r *Reconciler) startTicker(ctx context.Context, instance *arov1alpha1.Cluster) { +func (r *Reconciler) startGKTicker(ctx context.Context, instance *arov1alpha1.Cluster) { minutes := instance.Spec.OperatorFlags.GetWithDefault(controllerReconciliationMinutes, defaultReconciliationMinutes) min, err := strconv.Atoi(minutes) if err != nil { min, _ = strconv.Atoi(defaultReconciliationMinutes) } - if r.reconciliationMinutes != min && r.policyTickerDone != nil { + if r.reconciliationMinutes != min && r.gkTickerDone != nil { // trigger ticker reset r.reconciliationMinutes = min - r.policyTickerDone <- false + r.gkTickerDone <- false } // make sure only one ticker started - if r.policyTickerDone == nil { - go r.policyTicker(ctx, instance) + if r.gkTickerDone == nil { + go r.gkTicker(ctx, instance) } } -func (r *Reconciler) stopTicker() { - if r.policyTickerDone != nil { - r.policyTickerDone <- true - close(r.policyTickerDone) +func (r *Reconciler) stopGKTicker() { + if r.gkTickerDone != nil { + r.gkTickerDone <- true + close(r.gkTickerDone) } } diff --git a/pkg/operator/controllers/guardrails/guardrails_vap.go b/pkg/operator/controllers/guardrails/guardrails_vap.go new file mode 100644 index 00000000000..2db7552cf15 --- /dev/null +++ b/pkg/operator/controllers/guardrails/guardrails_vap.go @@ -0,0 +1,205 @@ +package guardrails + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "bytes" + "context" + "fmt" + "io/fs" + "path/filepath" + "strconv" + "strings" + "text/template" + "time" + + arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" + "github.com/Azure/ARO-RP/pkg/operator/controllers/guardrails/config" + "github.com/Azure/ARO-RP/pkg/util/dynamichelper" +) + +// vapValidationAction maps a Gatekeeper-style enforcement action to the +// equivalent VAP validationAction. +func vapValidationAction(gkEnforcement string) string { + switch strings.ToLower(gkEnforcement) { + case "deny": + return "Deny" + case "warn": + return "Warn" + case "dryrun", "audit": + return "Audit" + default: + // default to warn + return "Warn" + } +} + +// deployVAP creates or updates ValidatingAdmissionPolicy and +// ValidatingAdmissionPolicyBinding resources based on per-policy operator flags. +func (r *Reconciler) deployVAP(ctx context.Context, instance *arov1alpha1.Cluster) error { + r.log.Info("reconciling VAP policies") + + entries, err := fs.ReadDir(vapPolicies, vapPolicyPath) + if err != nil { + return fmt.Errorf("reading VAP policy directory: %w", err) + } + + for _, entry := range entries { + if entry.IsDir() { + continue + } + policyName := strings.TrimSuffix(entry.Name(), filepath.Ext(entry.Name())) + + managed, enforcement, err := r.getPolicyConfig(ctx, instance, entry.Name()) + if err != nil { + return err + } + + if !strings.EqualFold(managed, "true") { + if err := r.removeVAPPolicy(ctx, policyName); err != nil { + r.log.Warnf("failed to remove unmanaged VAP policy %s: %s", policyName, err.Error()) + } + continue + } + + if err := r.ensureVAPPolicy(ctx, entry.Name()); err != nil { + return fmt.Errorf("ensuring VAP policy %s: %w", policyName, err) + } + + validationAction := vapValidationAction(enforcement) + if err := r.ensureVAPBinding(ctx, policyName, validationAction); err != nil { + return fmt.Errorf("ensuring VAP binding for %s: %w", policyName, err) + } + } + + return nil +} + +// ensureVAPPolicy creates or updates the ValidatingAdmissionPolicy. +// The dynamic helper's Ensure method detects native Kubernetes resources +// and uses server-side apply, so all policy fields are correctly reconciled. +func (r *Reconciler) ensureVAPPolicy(ctx context.Context, filename string) error { + data, err := fs.ReadFile(vapPolicies, filepath.Join(vapPolicyPath, filename)) + if err != nil { + return err + } + uns, err := dynamichelper.DecodeUnstructured(data) + if err != nil { + return err + } + return r.dh.Ensure(ctx, uns) +} + +// ensureVAPBinding creates or updates the ValidatingAdmissionPolicyBinding. +// The dynamic helper's Ensure method detects native Kubernetes resources +// and uses server-side apply, so validationActions changes are applied +// atomically without needing a delete-then-create workaround. +func (r *Reconciler) ensureVAPBinding(ctx context.Context, policyName, validationAction string) error { + bindingFile := policyName + "-binding.yaml" + tmpl, err := template.ParseFS(vapBindings, filepath.Join(vapBindingPath, bindingFile)) + if err != nil { + return err + } + + buf := new(bytes.Buffer) + cfg := &config.GuardRailsVAPBindingConfig{ + ValidationAction: validationAction, + } + for _, t := range tmpl.Templates() { + if err := t.Execute(buf, cfg); err != nil { + return err + } + } + + uns, err := dynamichelper.DecodeUnstructured(buf.Bytes()) + if err != nil { + return err + } + + return r.dh.Ensure(ctx, uns) +} + +// removeVAPPolicy removes both the ValidatingAdmissionPolicyBinding and the +// ValidatingAdmissionPolicy for a given policy name. +func (r *Reconciler) removeVAPPolicy(ctx context.Context, policyName string) error { + bindingGK := "ValidatingAdmissionPolicyBinding.admissionregistration.k8s.io" + if err := r.dh.EnsureDeletedGVR(ctx, bindingGK, "", policyName+"-binding", "v1"); err != nil { + r.log.Warnf("failed to remove VAP binding %s-binding: %s", policyName, err.Error()) + } + + policyGK := "ValidatingAdmissionPolicy.admissionregistration.k8s.io" + if err := r.dh.EnsureDeletedGVR(ctx, policyGK, "", policyName, "v1"); err != nil { + r.log.Warnf("failed to remove VAP policy %s: %s", policyName, err.Error()) + } + return nil +} + +// removeAllVAP removes every VAP policy and binding known to the controller. +func (r *Reconciler) removeAllVAP(ctx context.Context) error { + entries, err := fs.ReadDir(vapPolicies, vapPolicyPath) + if err != nil { + return fmt.Errorf("reading VAP policy directory: %w", err) + } + + for _, entry := range entries { + if entry.IsDir() { + continue + } + policyName := strings.TrimSuffix(entry.Name(), filepath.Ext(entry.Name())) + if err := r.removeVAPPolicy(ctx, policyName); err != nil { + r.log.Warnf("failed to remove VAP policy %s: %s", policyName, err.Error()) + } + } + return nil +} + +// vapTicker periodically re-applies VAP policies and bindings to prevent them from being externally deleted. +func (r *Reconciler) vapTicker(ctx context.Context, instance *arov1alpha1.Cluster) { + r.vapTickerDone = make(chan bool) + var err error + + minutes := instance.Spec.OperatorFlags.GetWithDefault(controllerReconciliationMinutes, defaultReconciliationMinutes) + reconciliationMinutes, err := strconv.Atoi(minutes) + if err != nil { + reconciliationMinutes, _ = strconv.Atoi(defaultReconciliationMinutes) + } + + ticker := time.NewTicker(time.Duration(reconciliationMinutes) * time.Minute) + defer ticker.Stop() + for { + select { + case done := <-r.vapTickerDone: + if done { + r.vapTickerDone = nil + return + } + // false to trigger a ticker reset + minutes = instance.Spec.OperatorFlags.GetWithDefault(controllerReconciliationMinutes, defaultReconciliationMinutes) + reconciliationMinutes, err = strconv.Atoi(minutes) + if err != nil { + reconciliationMinutes, _ = strconv.Atoi(defaultReconciliationMinutes) + } + r.log.Infof("vapTicker reset to %d min", reconciliationMinutes) + ticker.Reset(time.Duration(reconciliationMinutes) * time.Minute) + case <-ticker.C: + err = r.deployVAP(ctx, instance) + if err != nil { + r.log.Errorf("vapTicker deployVAP error %s", err.Error()) + } + } + } +} + +func (r *Reconciler) startVAPTicker(ctx context.Context, instance *arov1alpha1.Cluster) { + if r.vapTickerDone == nil { + go r.vapTicker(ctx, instance) + } +} + +func (r *Reconciler) stopVAPTicker() { + if r.vapTickerDone != nil { + r.vapTickerDone <- true + close(r.vapTickerDone) + } +} diff --git a/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-machine-config-deny-binding.yaml b/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-machine-config-deny-binding.yaml new file mode 100644 index 00000000000..0185773a676 --- /dev/null +++ b/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-machine-config-deny-binding.yaml @@ -0,0 +1,8 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: aro-machine-config-deny-binding +spec: + policyName: aro-machine-config-deny + validationActions: + - {{.ValidationAction}} diff --git a/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-machines-deny-binding.yaml b/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-machines-deny-binding.yaml new file mode 100644 index 00000000000..d1187aa5cad --- /dev/null +++ b/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-machines-deny-binding.yaml @@ -0,0 +1,14 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: aro-machines-deny-binding +spec: + policyName: aro-machines-deny + validationActions: + - {{.ValidationAction}} + matchResources: + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: In + values: ["openshift-machine-api"] diff --git a/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-privileged-namespace-deny-binding.yaml b/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-privileged-namespace-deny-binding.yaml new file mode 100644 index 00000000000..47e97bae49c --- /dev/null +++ b/pkg/operator/controllers/guardrails/policies-vap/vap-binding/aro-privileged-namespace-deny-binding.yaml @@ -0,0 +1,8 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: aro-privileged-namespace-deny-binding +spec: + policyName: aro-privileged-namespace-deny + validationActions: + - {{.ValidationAction}} diff --git a/pkg/operator/controllers/guardrails/policies-vap/vap/aro-machine-config-deny.yaml b/pkg/operator/controllers/guardrails/policies-vap/vap/aro-machine-config-deny.yaml new file mode 100644 index 00000000000..1deb096cffb --- /dev/null +++ b/pkg/operator/controllers/guardrails/policies-vap/vap/aro-machine-config-deny.yaml @@ -0,0 +1,64 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + name: aro-machine-config-deny +spec: + failurePolicy: Ignore + matchConstraints: + resourceRules: + - apiGroups: ["machineconfiguration.openshift.io"] + apiVersions: ["*"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["machineconfigs"] + variables: + - name: protectedMachineConfigs + expression: | + [ + "00-master", + "00-worker", + "01-master-container-runtime", + "01-master-kubelet", + "01-worker-container-runtime", + "01-worker-kubelet", + "90-aro-worker-registries", + "97-master-generated-kubelet", + "97-worker-generated-kubelet", + "98-master-generated-kubelet", + "98-worker-generated-kubelet", + "99-master-aro-dns", + "99-master-aro-etc-hosts-gateway-domains", + "99-master-generated-kubelet", + "99-master-generated-registries", + "99-master-ssh", + "99-worker-aro-dns", + "99-worker-aro-etc-hosts-gateway-domains", + "99-worker-generated-kubelet", + "99-worker-generated-registries", + "99-worker-ssh" + ] + - name: exemptedUsers + expression: | + ["system:kube-controller-manager", "system:kube-scheduler", "system:admin", "system:aro-service"] + - name: mcName + expression: | + request.operation == "DELETE" + ? (has(oldObject.metadata) && has(oldObject.metadata.name) ? oldObject.metadata.name : "") + : (has(object.metadata) && has(object.metadata.name) ? object.metadata.name : "") + - name: isProtectedMC + expression: | + variables.mcName in variables.protectedMachineConfigs || + variables.mcName.matches("^rendered-(master|worker)-.+$") + - name: isExempted + expression: | + !has(request.userInfo) || + !has(request.userInfo.username) || + request.userInfo.username in variables.exemptedUsers || + (has(request.userInfo.groups) && request.userInfo.groups.exists(g, + g.contains("system:node") || g.contains("system:serviceaccount") || g.contains("system:masters") + )) + validations: + - expression: | + !variables.isProtectedMC || variables.isExempted + message: "Operation not allowed on protected MachineConfig" + messageExpression: | + "user " + (has(request.userInfo.username) ? request.userInfo.username : "unknown") + " not allowed to " + request.operation + " machine config " + variables.mcName diff --git a/pkg/operator/controllers/guardrails/policies-vap/vap/aro-machines-deny.yaml b/pkg/operator/controllers/guardrails/policies-vap/vap/aro-machines-deny.yaml new file mode 100644 index 00000000000..1898a935519 --- /dev/null +++ b/pkg/operator/controllers/guardrails/policies-vap/vap/aro-machines-deny.yaml @@ -0,0 +1,47 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + name: aro-machines-deny +spec: + failurePolicy: Fail + matchConstraints: + resourceRules: + - apiGroups: ["machine.openshift.io"] + apiVersions: ["*"] + operations: ["DELETE"] + resources: ["machines"] + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: In + values: ["openshift-machine-api"] + variables: + - name: denyLabelKey + expression: | + "machine.openshift.io/cluster-api-machine-role" + - name: denyLabelRegex + expression: | + "master" + - name: exemptedUsers + expression: | + ["system:kube-controller-manager", "system:kube-scheduler", "system:admin", "system:aro-service"] + - name: isExempted + expression: | + !has(request.userInfo) || + !has(request.userInfo.username) || + request.userInfo.username in variables.exemptedUsers || + (has(request.userInfo.groups) && request.userInfo.groups.exists(g, + g.contains("system:node") || g.contains("system:serviceaccount") || g.contains("system:masters") + )) + - name: isMasterMachine + expression: | + has(oldObject.metadata) && + has(oldObject.metadata.labels) && + variables.denyLabelKey in oldObject.metadata.labels && + oldObject.metadata.labels[variables.denyLabelKey].matches(variables.denyLabelRegex) + validations: + - expression: | + !variables.isMasterMachine || variables.isExempted + message: "Deletion of master machines is not allowed" + messageExpression: | + "user <" + (has(request.userInfo.username) ? request.userInfo.username : "unknown") + "> not allowed to delete resource with label " diff --git a/pkg/operator/controllers/guardrails/policies-vap/vap/aro-privileged-namespace-deny.yaml b/pkg/operator/controllers/guardrails/policies-vap/vap/aro-privileged-namespace-deny.yaml new file mode 100644 index 00000000000..1706bdc6d2e --- /dev/null +++ b/pkg/operator/controllers/guardrails/policies-vap/vap/aro-privileged-namespace-deny.yaml @@ -0,0 +1,79 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + name: aro-privileged-namespace-deny +spec: + failurePolicy: Ignore #TODO: how to change this to eg, Fail? via old patch or allow change this field directly? + matchConstraints: + resourceRules: + - apiGroups: [""] + apiVersions: ["*"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["pods", "secrets", "services", "serviceaccounts", "replicationcontrollers", "resourcequotas", "namespaces"] + - apiGroups: ["apps"] + apiVersions: ["*"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["deployments", "replicasets", "statefulsets", "daemonsets"] + - apiGroups: ["batch"] + apiVersions: ["*"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["jobs", "cronjobs"] + - apiGroups: ["rbac.authorization.k8s.io"] + apiVersions: ["*"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["roles", "rolebindings"] + - apiGroups: ["policy"] + apiVersions: ["*"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["poddisruptionbudgets"] + variables: + - name: privilegedNamespaces + expression: | + [ + "kube-node-lease", "kube-public", "kube-system", + "openshift-azure-logging", "openshift-azure-operator", + "openshift-managed-upgrade-operator", "openshift-azure-guardrails", + "openshift", "openshift-apiserver", "openshift-apiserver-operator", + "openshift-authentication-operator", "openshift-cloud-controller-manager", + "openshift-cloud-controller-manager-operator", "openshift-cloud-credential-operator", + "openshift-cluster-machine-approver", "openshift-cluster-storage-operator", + "openshift-cluster-version", "openshift-config-managed", "openshift-config-operator", + "openshift-console", "openshift-console-operator", "openshift-controller-manager", + "openshift-controller-manager-operator", "openshift-dns", "openshift-dns-operator", + "openshift-etcd", "openshift-etcd-operator", "openshift-host-network", + "openshift-image-registry", "openshift-ingress", "openshift-ingress-operator", + "openshift-kube-apiserver", "openshift-kube-apiserver-operator", + "openshift-kube-controller-manager", "openshift-kube-controller-manager-operator", + "openshift-kube-scheduler", "openshift-kube-scheduler-operator", + "openshift-machine-api", "openshift-machine-config-operator", "openshift-monitoring", + "openshift-multus", "openshift-network-operator", "openshift-oauth-apiserver", + "openshift-operator-lifecycle-manager", "openshift-ovn-kubernetes", "openshift-sdn", + "openshift-service-ca", "openshift-service-ca-operator" + ] + - name: exemptedUsers + expression: | + ["system:kube-controller-manager", "system:kube-scheduler", "system:admin", "system:aro-service"] + - name: targetObj + expression: | + request.operation == "DELETE" ? oldObject : object + - name: targetNamespace + expression: | + !has(variables.targetObj.metadata) ? "" : + (request.kind.kind == "Namespace" ? variables.targetObj.metadata.name : + (has(variables.targetObj.metadata.namespace) ? variables.targetObj.metadata.namespace : "")) + - name: isExempted + expression: | + !has(request.userInfo) || + !has(request.userInfo.username) || + request.userInfo.username in variables.exemptedUsers || + (has(request.userInfo.groups) && request.userInfo.groups.exists(g, + g.contains("system:node") || g.contains("system:serviceaccount") || g.contains("system:masters") + )) + validations: + - expression: | + !(variables.targetNamespace in variables.privilegedNamespaces) || variables.isExempted + message: "Operation not allowed in privileged namespace" + messageExpression: | + request.kind.kind == "Namespace" + ? "user " + (has(request.userInfo.username) ? request.userInfo.username : "unknown") + " not allowed to " + request.operation + " namespace " + (has(variables.targetObj.metadata) ? variables.targetObj.metadata.name : "unknown") + : "user " + (has(request.userInfo.username) ? request.userInfo.username : "unknown") + " not allowed to " + request.operation + " " + (has(variables.targetObj.metadata) ? variables.targetObj.metadata.name : "unknown") + " in namespace " + (has(variables.targetObj.metadata) && has(variables.targetObj.metadata.namespace) ? variables.targetObj.metadata.namespace : "unknown") diff --git a/pkg/util/dynamichelper/dynamichelper.go b/pkg/util/dynamichelper/dynamichelper.go index 7d5f47c9e2e..85ff31bce94 100644 --- a/pkg/util/dynamichelper/dynamichelper.go +++ b/pkg/util/dynamichelper/dynamichelper.go @@ -5,15 +5,19 @@ package dynamichelper import ( "context" + "encoding/json" + "fmt" "strings" "github.com/sirupsen/logrus" kerrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" kruntime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -22,6 +26,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/Azure/ARO-RP/pkg/util/clienthelper" + "github.com/Azure/ARO-RP/pkg/util/pointerutils" _ "github.com/Azure/ARO-RP/pkg/util/scheme" ) @@ -97,9 +102,19 @@ func (dh *dynamicHelper) EnsureDeletedGVR(ctx context.Context, groupKind, namesp func (dh *dynamicHelper) Ensure(ctx context.Context, objs ...kruntime.Object) error { for _, o := range objs { if un, ok := o.(*unstructured.Unstructured); ok { - err := dh.ensureUnstructuredObj(ctx, un) - if err != nil { - return err + // ValidatingAdmissionPolicy and ValidatingAdmissionPolicyBinding + // are handled via server-side apply so that all fields are + // correctly reconciled. The Gatekeeper-specific path only + // compares enforcementAction and would silently skip updates + // to other resource types. + if isAdmissionRegistrationResource(un) { + if err := dh.ensureByServerSideApply(ctx, un); err != nil { + return err + } + } else { + if err := dh.ensureGatekeeperConstraint(ctx, un); err != nil { + return err + } } continue } @@ -160,6 +175,40 @@ func (dh *dynamicHelper) mergeWithLogic(name, groupKind string, old, new kruntim return clienthelper.Merge(old.(client.Object), new.(client.Object)) } +// isAdmissionRegistrationResource returns true for ValidatingAdmissionPolicy +// and ValidatingAdmissionPolicyBinding resources that should be managed via +// server-side apply rather than the Gatekeeper-specific path. +func isAdmissionRegistrationResource(uns *unstructured.Unstructured) bool { + return uns.GroupVersionKind().Group == "admissionregistration.k8s.io" +} + +// ensureByServerSideApply creates or updates a single unstructured object +// using server-side apply. This correctly reconciles all fields, unlike +// ensureUnstructuredObj which only compares Gatekeeper's enforcementAction. +func (dh *dynamicHelper) ensureByServerSideApply(ctx context.Context, uns *unstructured.Unstructured) error { + gvr, err := dh.Resolve(uns.GroupVersionKind().GroupKind().String(), uns.GroupVersionKind().Version) + if err != nil { + return err + } + + data, err := json.Marshal(uns) + if err != nil { + return fmt.Errorf("marshalling %s/%s: %w", uns.GroupVersionKind().GroupKind(), uns.GetName(), err) + } + + dh.log.Infof("Apply %s", keyFunc(uns.GroupVersionKind().GroupKind(), uns.GetNamespace(), uns.GetName())) + _, err = dh.dynamicClient.Resource(*gvr). + Namespace(uns.GetNamespace()). + Patch(ctx, uns.GetName(), types.ApplyPatchType, data, metav1.PatchOptions{ + FieldManager: "aro-operator", + Force: pointerutils.ToPtr(true), + }) + if err != nil { + return fmt.Errorf("server-side apply %s/%s: %w", uns.GroupVersionKind().GroupKind(), uns.GetName(), err) + } + return nil +} + func makeURLSegments(gvr *schema.GroupVersionResource, namespace, name string) (url []string) { if gvr.Group == "" { url = append(url, "api") diff --git a/pkg/util/dynamichelper/guardrails.go b/pkg/util/dynamichelper/guardrails.go index 4dd280796e4..1ff708a5bee 100644 --- a/pkg/util/dynamichelper/guardrails.go +++ b/pkg/util/dynamichelper/guardrails.go @@ -23,9 +23,10 @@ import ( _ "github.com/Azure/ARO-RP/pkg/util/scheme" ) -// the UnstructuredObj related stuff is specifically for the Guardrails -// to handle the gatekeeper Constraint as it does not have a scheme that can be imported -func (dh *dynamicHelper) ensureUnstructuredObj(ctx context.Context, uns *unstructured.Unstructured) error { +// ensureGatekeeperConstraint handles Gatekeeper Constraint resources which +// do not have a scheme that can be imported. It only compares the +// spec.enforcementAction field to decide whether an update is needed. +func (dh *dynamicHelper) ensureGatekeeperConstraint(ctx context.Context, uns *unstructured.Unstructured) error { gvr, err := dh.Resolve(uns.GroupVersionKind().GroupKind().String(), uns.GroupVersionKind().Version) if err != nil { return err diff --git a/test/e2e/operator.go b/test/e2e/operator.go index d53bd2436ab..3d9ce7cd65f 100644 --- a/test/e2e/operator.go +++ b/test/e2e/operator.go @@ -17,6 +17,8 @@ import ( corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/discovery" "k8s.io/client-go/util/retry" @@ -41,6 +43,7 @@ import ( "github.com/Azure/ARO-RP/pkg/util/conditions" "github.com/Azure/ARO-RP/pkg/util/pointerutils" "github.com/Azure/ARO-RP/pkg/util/ready" + utilversion "github.com/Azure/ARO-RP/pkg/util/version" ) const ( @@ -730,48 +733,197 @@ var _ = Describe("ARO Operator - Guardrails", func() { gkAuditDeployment = "gatekeeper-audit" ) - It("Controller Manager must be restored if deleted", func(ctx context.Context) { + isGuardrailsEnabled := func(ctx context.Context) bool { instance, err := clients.AROClusters.AroV1alpha1().Clusters().Get(ctx, "cluster", metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) + return instance.Spec.OperatorFlags.GetSimpleBoolean(guardrailsEnabledFlag) && + instance.Spec.OperatorFlags.GetSimpleBoolean(guardrailsDeployManagedFlag) + } - if !instance.Spec.OperatorFlags.GetSimpleBoolean(guardrailsEnabledFlag) || - !instance.Spec.OperatorFlags.GetSimpleBoolean(guardrailsDeployManagedFlag) { - Skip("Guardrails Controller is not enabled, skipping test") + clusterIsAtLeast417 := func(ctx context.Context) bool { + cv, err := clients.ConfigClient.ConfigV1().ClusterVersions().Get(ctx, "version", metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + clusterVer, err := utilversion.GetClusterVersion(cv) + if err != nil { + return false } + ver417, _ := utilversion.ParseVersion("4.17.0") + return !clusterVer.Lt(ver417) + } + + // --- Pre-4.17 Gatekeeper tests --- + + Context("Gatekeeper (pre-4.17)", func() { + BeforeEach(func(ctx context.Context) { + if !isGuardrailsEnabled(ctx) { + Skip("Guardrails Controller is not enabled") + } + if clusterIsAtLeast417(ctx) { + Skip("Cluster is v4.17+, Gatekeeper is replaced by VAP") + } + }) + + It("Controller Manager must be restored if deleted", func(ctx context.Context) { + getFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Get + deleteFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Delete + + By("waiting for the gatekeeper Controller Manager deployment to be ready") + GetK8sObjectWithRetry(ctx, getFunc, gkControllerManagerDeployment, metav1.GetOptions{}) + + By("deleting the gatekeeper Controller Manager deployment") + DeleteK8sObjectWithRetry(ctx, deleteFunc, gkControllerManagerDeployment, metav1.DeleteOptions{}) + + By("waiting for the gatekeeper Controller Manager deployment to be reconciled") + GetK8sObjectWithRetry(ctx, getFunc, gkControllerManagerDeployment, metav1.GetOptions{}) + }) - getFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Get - deleteFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Delete + It("Audit must be restored if deleted", func(ctx context.Context) { + getFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Get + deleteFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Delete - By("waiting for the gatekeeper Controller Manager deployment to be ready") - GetK8sObjectWithRetry(ctx, getFunc, gkControllerManagerDeployment, metav1.GetOptions{}) + By("waiting for the gatekeeper Audit deployment to be ready") + GetK8sObjectWithRetry(ctx, getFunc, gkAuditDeployment, metav1.GetOptions{}) - By("deleting the gatekeeper Controller Manager deployment") - DeleteK8sObjectWithRetry(ctx, deleteFunc, gkControllerManagerDeployment, metav1.DeleteOptions{}) + By("deleting the gatekeeper Audit deployment") + DeleteK8sObjectWithRetry(ctx, deleteFunc, gkAuditDeployment, metav1.DeleteOptions{}) - By("waiting for the gatekeeper Controller Manager deployment to be reconciled") - GetK8sObjectWithRetry(ctx, getFunc, gkControllerManagerDeployment, metav1.GetOptions{}) + By("waiting for the gatekeeper Audit deployment to be reconciled") + GetK8sObjectWithRetry(ctx, getFunc, gkAuditDeployment, metav1.GetOptions{}) + }) + + It("Gatekeeper constraint must exist when policy is managed", func(ctx context.Context) { + templateObj := &unstructured.Unstructured{} + templateObj.SetAPIVersion("constraints.gatekeeper.sh/v1beta1") + templateObj.SetKind("ARODenyLabels") + + constraintClient, err := clients.Dynamic.GetClient(templateObj) + Expect(err).NotTo(HaveOccurred()) + + By("verifying aro-machines-deny constraint exists") + Eventually(func(g Gomega, ctx context.Context) { + obj, err := constraintClient.Get(ctx, "aro-machines-deny", metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(obj.GetName()).To(Equal("aro-machines-deny")) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) + }) }) - It("Audit must be restored if deleted", func(ctx context.Context) { - instance, err := clients.AROClusters.AroV1alpha1().Clusters().Get(ctx, "cluster", metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) + // --- 4.17+ VAP tests --- - if !instance.Spec.OperatorFlags.GetSimpleBoolean(guardrailsEnabledFlag) || - !instance.Spec.OperatorFlags.GetSimpleBoolean(guardrailsDeployManagedFlag) { - Skip("Guardrails Controller is not enabled, skipping test") - } + Context("ValidatingAdmissionPolicy (v4.17+)", func() { + var vapClient func(ctx context.Context, name string, options metav1.GetOptions) (*unstructured.Unstructured, error) + var vapBindingClient func(ctx context.Context, name string, options metav1.GetOptions) (*unstructured.Unstructured, error) - getFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Get - deleteFunc := clients.Kubernetes.AppsV1().Deployments(guardrailsNamespace).Delete + BeforeEach(func(ctx context.Context) { + if !isGuardrailsEnabled(ctx) { + Skip("Guardrails Controller is not enabled") + } + if !clusterIsAtLeast417(ctx) { + Skip("Cluster is pre-4.17, using Gatekeeper instead of VAP") + } - By("waiting for the gatekeeper Audit deployment to be ready") - GetK8sObjectWithRetry(ctx, getFunc, gkAuditDeployment, metav1.GetOptions{}) + vapObj := &unstructured.Unstructured{} + vapObj.SetAPIVersion("admissionregistration.k8s.io/v1") + vapObj.SetKind("ValidatingAdmissionPolicy") + vc, err := clients.Dynamic.GetClient(vapObj) + Expect(err).NotTo(HaveOccurred()) + vapClient = vc.Get - By("deleting the gatekeeper Audit deployment") - DeleteK8sObjectWithRetry(ctx, deleteFunc, gkAuditDeployment, metav1.DeleteOptions{}) + bindingObj := &unstructured.Unstructured{} + bindingObj.SetAPIVersion("admissionregistration.k8s.io/v1") + bindingObj.SetKind("ValidatingAdmissionPolicyBinding") + bc, err := clients.Dynamic.GetClient(bindingObj) + Expect(err).NotTo(HaveOccurred()) + vapBindingClient = bc.Get + }) + + It("should have all expected VAP policies created", func(ctx context.Context) { + expectedPolicies := []string{ + "aro-machines-deny", + "aro-machine-config-deny", + "aro-privileged-namespace-deny", + } + + for _, name := range expectedPolicies { + By(fmt.Sprintf("verifying ValidatingAdmissionPolicy %s exists", name)) + Eventually(func(g Gomega, ctx context.Context) { + obj, err := vapClient(ctx, name, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(obj.GetName()).To(Equal(name)) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) + } + }) + + It("should have all expected VAP bindings created", func(ctx context.Context) { + expectedBindings := []string{ + "aro-machines-deny-binding", + "aro-machine-config-deny-binding", + "aro-privileged-namespace-deny-binding", + } + + for _, name := range expectedBindings { + By(fmt.Sprintf("verifying ValidatingAdmissionPolicyBinding %s exists", name)) + Eventually(func(g Gomega, ctx context.Context) { + obj, err := vapBindingClient(ctx, name, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(obj.GetName()).To(Equal(name)) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) + } + }) + + It("should remove and recreate VAP policy when toggled via operator flags", func(ctx context.Context) { + policyName := "aro-machines-deny" + bindingName := "aro-machines-deny-binding" + + By("verifying the VAP policy exists before disabling") + Eventually(func(g Gomega, ctx context.Context) { + _, err := vapClient(ctx, policyName, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) - By("waiting for the gatekeeper Audit deployment to be reconciled") - GetK8sObjectWithRetry(ctx, getFunc, gkAuditDeployment, metav1.GetOptions{}) + By("disabling the aro-machines-deny policy via operator flag") + patchPayload := []byte(`[{"op": "replace", "path": "/spec/operatorflags/aro.guardrails.policies.aro-machines-deny.managed", "value": "false"}]`) + _, err := clients.AROClusters.AroV1alpha1().Clusters().Patch(ctx, "cluster", types.JSONPatchType, patchPayload, metav1.PatchOptions{}) + Expect(err).NotTo(HaveOccurred()) + DeferCleanup(func(ctx context.Context) { + revert := []byte(`[{"op": "replace", "path": "/spec/operatorflags/aro.guardrails.policies.aro-machines-deny.managed", "value": "true"}]`) + _, err := clients.AROClusters.AroV1alpha1().Clusters().Patch(ctx, "cluster", types.JSONPatchType, revert, metav1.PatchOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + + By("waiting for the VAP policy to be removed") + Eventually(func(g Gomega, ctx context.Context) { + _, err := vapClient(ctx, policyName, metav1.GetOptions{}) + g.Expect(err).To(HaveOccurred()) + g.Expect(kerrors.IsNotFound(err)).To(BeTrue()) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) + + By("waiting for the VAP binding to be removed") + Eventually(func(g Gomega, ctx context.Context) { + _, err := vapBindingClient(ctx, bindingName, metav1.GetOptions{}) + g.Expect(err).To(HaveOccurred()) + g.Expect(kerrors.IsNotFound(err)).To(BeTrue()) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) + + By("re-enabling the aro-machines-deny policy via operator flag") + reEnable := []byte(`[{"op": "replace", "path": "/spec/operatorflags/aro.guardrails.policies.aro-machines-deny.managed", "value": "true"}]`) + _, err = clients.AROClusters.AroV1alpha1().Clusters().Patch(ctx, "cluster", types.JSONPatchType, reEnable, metav1.PatchOptions{}) + Expect(err).NotTo(HaveOccurred()) + + By("waiting for the VAP policy to be recreated") + Eventually(func(g Gomega, ctx context.Context) { + obj, err := vapClient(ctx, policyName, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(obj.GetName()).To(Equal(policyName)) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) + + By("waiting for the VAP binding to be recreated") + Eventually(func(g Gomega, ctx context.Context) { + obj, err := vapBindingClient(ctx, bindingName, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(obj.GetName()).To(Equal(bindingName)) + }).WithContext(ctx).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed()) + }) }) })