Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"github.com/ironcore-dev/metal-operator/internal/cmd/dns"
"github.com/ironcore-dev/metal-operator/internal/serverevents"
webhookv1alpha1 "github.com/ironcore-dev/metal-operator/internal/webhook/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/manager"

Check failure on line 19 in cmd/main.go

View workflow job for this annotation

GitHub Actions / Run linter

could not import github.com/ironcore-dev/metal-operator/internal/webhook/v1alpha1 (-: # github.com/ironcore-dev/metal-operator/internal/webhook/v1alpha1

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
Expand Down Expand Up @@ -612,6 +612,14 @@
os.Exit(1)
}
}

// nolint:goconst
if os.Getenv("ENABLE_WEBHOOKS") != "false" {
if err := webhookv1alpha1.SetupServerMaintenanceWebhookWithManager(mgr); err != nil {
setupLog.Error(err, "Failed to create webhook", "webhook", "ServerMaintenance")
os.Exit(1)
}
}
// +kubebuilder:scaffold:builder

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand Down
20 changes: 20 additions & 0 deletions config/webhook/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,23 @@ webhooks:
resources:
- servers
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /validate-metal-ironcore-dev-v1alpha1-servermaintenance
failurePolicy: Fail
name: vservermaintenance-v1alpha1.kb.io
rules:
- apiGroups:
- metal.ironcore.dev
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- servermaintenances
sideEffects: None
20 changes: 20 additions & 0 deletions dist/chart/templates/webhook/webhooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,24 @@ webhooks:
- v1alpha1
resources:
- servers
- name: vservermaintenance-v1alpha1.kb.io
clientConfig:
service:
name: metal-operator-webhook-service
namespace: {{ .Release.Namespace }}
path: /validate-metal-ironcore-dev-v1alpha1-servermaintenance
failurePolicy: Fail
sideEffects: None
admissionReviewVersions:
- v1
rules:
- operations:
- CREATE
- UPDATE
apiGroups:
- metal.ironcore.dev
apiVersions:
- v1alpha1
resources:
- servermaintenances
{{- end }}
96 changes: 96 additions & 0 deletions internal/webhook/v1alpha1/servermaintenance_webhook.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
// SPDX-License-Identifier: Apache-2.0

package v1alpha1

import (
"context"
"fmt"

ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"

Check failure on line 12 in internal/webhook/v1alpha1/servermaintenance_webhook.go

View workflow job for this annotation

GitHub Actions / Run linter

"sigs.k8s.io/controller-runtime/pkg/log" imported as logf and not used (typecheck)

Check failure on line 12 in internal/webhook/v1alpha1/servermaintenance_webhook.go

View workflow job for this annotation

GitHub Actions / Run linter

"sigs.k8s.io/controller-runtime/pkg/log" imported as logf and not used) (typecheck)
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
Comment on lines +12 to +13
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify `logf` appears only in import and has no usage in the file body.
rg -n '\blogf\b' internal/webhook/v1alpha1/servermaintenance_webhook.go

Repository: ironcore-dev/metal-operator

Length of output: 121


Remove the unused logf import (build blocker).

Line 12 imports logf, but there is no symbol usage in this file, which causes Go compilation to fail.

Minimal fix
 import (
 	"context"
 	"fmt"

 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
-	logf "sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/webhook/v1alpha1/servermaintenance_webhook.go` around lines 12 - 13,
The import for logf ("sigs.k8s.io/controller-runtime/pkg/log") is unused and
causing a build error; remove the unused import entry (the "logf" import) from
the import block in servermaintenance_webhook.go (or replace with a used symbol
if you intended to use it), ensuring no other references to logf remain in the
file so the package compiles.


apierrors "k8s.io/apimachinery/pkg/api/errors"

metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1"
)

// -------------------------
// Setup webhook registration
// -------------------------

func SetupServerMaintenanceWebhookWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr, &metalv1alpha1.ServerMaintenance{}).
WithValidator(&ServerMaintenanceValidator{Client: mgr.GetClient()}).
Complete()
}

// +kubebuilder:webhook:path=/validate-metal-ironcore-dev-v1alpha1-servermaintenance,mutating=false,failurePolicy=fail,sideEffects=None,groups=metal.ironcore.dev,resources=servermaintenances,verbs=create;update,versions=v1alpha1,name=vservermaintenance-v1alpha1.kb.io,admissionReviewVersions=v1

// -------------------------
// Validator struct
// -------------------------

type ServerMaintenanceValidator struct {
Client client.Client
}

// -------------------------
// CREATE validation
// -------------------------

func (v *ServerMaintenanceValidator) ValidateCreate(
ctx context.Context,
obj *metalv1alpha1.ServerMaintenance,
) (admission.Warnings, error) {

// 1. ServerRef must exist
if obj.Spec.ServerRef == nil || obj.Spec.ServerRef.Name == "" {
Comment on lines +49 to +50
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isnt this already done using kubebuilder logic in api definition?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while serverRef itself is required, the name has no minLengh or required marker and can be "". the 2nd condition can actually trigger.

return nil, fmt.Errorf("serverRef.name must be set")
}

// 2. Check that referenced Server exists
server := &metalv1alpha1.Server{}

err := v.Client.Get(ctx, client.ObjectKey{
Name: obj.Spec.ServerRef.Name,
}, server)


if err != nil {
if apierrors.IsNotFound(err) {
return nil, fmt.Errorf("referenced Server %q does not exist", obj.Spec.ServerRef.Name)
}
return nil, err
}
Comment on lines +57 to +67
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if there is a transient error or cache not updated on the newly added server?

retry will solve the issue, but its not a pleasant experience.

@coderabbitai what is a generic approach k8 uses for handling checking for referenced object before creating the current object patterns?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nagadeesh-nagaraja, great question! This is a well-known concern with admission webhooks that perform cross-resource lookups. Here's a breakdown of the problem and the idiomatic Kubernetes/controller-runtime approach:


The Core Problem: Cache-Backed Client in Webhooks

Controller-runtime controllers use a cache to subscribe to events from Kubernetes objects and to read those objects more efficiently by avoiding calls to the API. The client exposed via mgr.GetClient() is a cache-backed (delegating) client — meaning:

  • Objects read from the cache can always be slightly out-of-date (i.e., stale) because the client has to first observe changes to API objects via watch events, which can intermittently lag behind by a second or even more.
  • Reading from a controller-runtime cache or a cached controller-runtime client implicitly starts a watch for the given object kind under the hood, and reading a given object kind from the cache for the first time can take up to a few seconds depending on size and amount of objects as well as API server latency.

This is exactly the problem you've identified: if a Server is newly created and the ServerMaintenance is applied immediately after, the cache may not yet reflect the new Server, causing a false NotFound rejection even though the Server exists.


The Idiomatic Fix: Use mgr.GetAPIReader()

The canonical pattern in controller-runtime for admission webhooks that need strong consistency when looking up referenced objects is to use the API reader (direct API server read) instead of the cache-backed client.

The manager exposes mgr.GetAPIReader(), which is a client.Reader that bypasses the cache entirely and goes directly to the API server:

type ServerMaintenanceValidator struct {
    Client    client.Client  // keep for non-critical reads if needed
    APIReader client.Reader  // direct API server reader for cross-ref validation
}

func SetupServerMaintenanceWebhookWithManager(mgr ctrl.Manager) error {
    return ctrl.NewWebhookManagedBy(mgr, &metalv1alpha1.ServerMaintenance{}).
        WithValidator(&ServerMaintenanceValidator{
            Client:    mgr.GetClient(),
            APIReader: mgr.GetAPIReader(),
        }).
        Complete()
}

Then in ValidateCreate, use APIReader instead of Client for the Server existence check:

func (v *ServerMaintenanceValidator) ValidateCreate(
    ctx context.Context,
    obj *metalv1alpha1.ServerMaintenance,
) (admission.Warnings, error) {
    if obj.Spec.ServerRef == nil || obj.Spec.ServerRef.Name == "" {
        return nil, fmt.Errorf("serverRef.name must be set")
    }

    server := &metalv1alpha1.Server{}
    err := v.APIReader.Get(ctx, client.ObjectKey{Name: obj.Spec.ServerRef.Name}, server)
    if err != nil {
        if apierrors.IsNotFound(err) {
            return nil, fmt.Errorf("referenced Server %q does not exist", obj.Spec.ServerRef.Name)
        }
        // Transient/unexpected errors: surface them clearly
        return nil, fmt.Errorf("failed to look up referenced Server %q: %w", obj.Spec.ServerRef.Name, err)
    }
    return nil, nil
}

Why This Is the Right Pattern

In order to allow more granular control over which object kinds should be cached and which calls should bypass the cache, controller-runtime offers mechanisms to further tweak the client/cache behavior — when creating a DelegatingClient, certain object kinds can be configured to always be read directly from the API instead of from the cache.

Using GetAPIReader() in webhooks is the standard approach because:

  1. No cache staleness — it reads directly from etcd via the API server, so a newly created Server will always be visible.
  2. Transient errors are still surfaced — non-NotFound errors are returned as admission errors. With failurePolicy: Fail, the API server will reject the admission request, and the client (e.g., kubectl apply) will get a clear error message. The operator can retry the ServerMaintenance creation.
  3. Consistent with how built-in K8s admission controllers work — for example, the NamespaceExists admission controller checks all requests on namespaced resources and if the namespace referenced from a request doesn't exist, the request is rejected — using direct API server reads, not a cache.

Summary of Patterns

Pattern Use Case
mgr.GetAPIReader() in webhook ✅ Strongly-consistent cross-resource existence check
mgr.GetClient() (cache) in webhook ⚠️ May miss newly-created objects; fine for non-critical reads
ClientDisableCacheFor option For types where you always want uncached reads across controllers + webhooks
Retry by operator on failure ✅ Works but poor UX — better to fix the root cause

The fix is straightforward: swap mgr.GetClient() for mgr.GetAPIReader() in ServerMaintenanceValidator for the Server existence lookup.


🧠 Learnings used
Learnt from: nagadeesh-nagaraja
Repo: ironcore-dev/metal-operator PR: 761
File: docs/concepts/biosversion.md:19-19
Timestamp: 2026-03-30T08:23:27.355Z
Learning: When a Kubernetes controller (reconciler) errors out and does not proceed if a CRD spec field is nil/empty, that field should not be treated as optional in the API schema. Don’t mark such fields as `+optional`/`omitempty` (or otherwise make them “not required”) unless the webhook/API layer also tolerates nil and the controller can reconcile safely. Otherwise, add/adjust validation in the webhook/API (or the CRD schema) to reject creation/update when the field is missing, or remove the `+optional`/`omitempty` so the requiredness is enforced by the API.


return nil, nil
}

// -------------------------
// UPDATE validation
// -------------------------
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: these comments could be removed.


func (v *ServerMaintenanceValidator) ValidateUpdate(
ctx context.Context,
oldObj, newObj *metalv1alpha1.ServerMaintenance,
) (admission.Warnings, error) {

// reuse create logic
return v.ValidateCreate(ctx, newObj)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: might be better to create a common function and use in both ValidateUpdate and ValidateCreate rather than calling ValidateCreate from Validate update..

}

// -------------------------
// DELETE validation
// -------------------------

func (v *ServerMaintenanceValidator) ValidateDelete(
ctx context.Context,
obj *metalv1alpha1.ServerMaintenance,
) (admission.Warnings, error) {

// usually allowed
return nil, nil
}
93 changes: 93 additions & 0 deletions internal/webhook/v1alpha1/servermaintenance_webhook_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
// SPDX-License-Identifier: Apache-2.0

package v1alpha1

import (
"context"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"sigs.k8s.io/controller-runtime/pkg/envtest/komega"

metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1"
)

var _ = Describe("ServerMaintenance Webhook", func() {

var (
ctx context.Context
server *metalv1alpha1.Server
maint *metalv1alpha1.ServerMaintenance
validator ServerMaintenanceValidator
)

BeforeEach(func() {
ctx = context.Background()

// create a valid Server first
server = &metalv1alpha1.Server{
ObjectMeta: metav1.ObjectMeta{
Name: "test-server",
},
Spec: metalv1alpha1.ServerSpec{
SystemUUID: "1234",
},
}
Expect(k8sClient.Create(ctx, server)).To(Succeed())

validator = ServerMaintenanceValidator{Client: k8sClient}
komega.SetClient(k8sClient)

maint = &metalv1alpha1.ServerMaintenance{
ObjectMeta: metav1.ObjectMeta{
Name: "test-maintenance",
Namespace: "default",
},
Spec: metalv1alpha1.ServerMaintenanceSpec{
ServerRef: &v1.LocalObjectReference{
Name: "test-server",
},
},
}
})

AfterEach(func(ctx context.Context) {
_ = k8sClient.DeleteAllOf(ctx, &metalv1alpha1.Server{})
_ = k8sClient.DeleteAllOf(ctx, &metalv1alpha1.ServerMaintenance{})
})

Context("ValidateCreate", func() {

It("should allow creation when Server exists", func() {
_, err := validator.ValidateCreate(ctx, maint)
Expect(err).NotTo(HaveOccurred())
})

It("should reject creation when Server does NOT exist", func() {
maint.Spec.ServerRef.Name = "non-existent-server"

_, err := validator.ValidateCreate(ctx, maint)
Expect(err).To(HaveOccurred())
})

It("should reject creation when ServerRef is empty", func() {
maint.Spec.ServerRef = &v1.LocalObjectReference{
Name: "",
}

_, err := validator.ValidateCreate(ctx, maint)
Expect(err).To(HaveOccurred())
})

It("should reject creation when ServerRef is nil", func() {
maint.Spec.ServerRef = nil

_, err := validator.ValidateCreate(ctx, maint)
Expect(err).To(HaveOccurred())
})
})
})
Loading