Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
9333c7d
Add event subscription management for server events and metrics
stefanhipfel Oct 13, 2025
ecec4f8
Add SPDX license headers
stefanhipfel Oct 13, 2025
ea45555
Add metrics and events subscription links to ServerStatus and update …
stefanhipfel Oct 29, 2025
dbad795
make check
stefanhipfel Oct 29, 2025
4e40c37
make generate && make docs && make helm
stefanhipfel Oct 30, 2025
62cbaf9
Merge branch 'main' into redfish_events
stefanhipfel Oct 30, 2025
bbad483
moves event subscription to bmc
stefanhipfel Nov 5, 2025
a527b0a
Merge branch 'main' into redfish_events
stefanhipfel Nov 10, 2025
388d742
Merge branch 'main' into redfish_events
stefanhipfel Nov 13, 2025
dbe2b7f
Merge branch 'main' into redfish_events
stefanhipfel Dec 29, 2025
f41134a
Fix SubscribeMetricsReport to return an empty string on error and add…
stefanhipfel Dec 29, 2025
654d26f
Remove redundant EventURL field from ServerReconciler
stefanhipfel Dec 29, 2025
77babc4
Merge branch 'main' into redfish_events
stefanhipfel Jan 12, 2026
e15979d
Merge branch 'main' into redfish_events
stefanhipfel Jan 20, 2026
0a0b668
Fix comment typo in CreateEventSubscription method and remove unused …
stefanhipfel Jan 21, 2026
fc2d83b
Enhance error handling in event subscription creation and clean up ma…
stefanhipfel Jan 22, 2026
1546e65
Fix URL formatting in SubscribeMetricsReport and SubscribeEvents func…
stefanhipfel Jan 22, 2026
0e80893
Refactor alert handling to remove vendor label and improve logging in…
stefanhipfel Jan 22, 2026
e907634
Refactor event subscription handling to improve error management and …
stefanhipfel Jan 23, 2026
07c4ad7
Refactor metrics report handler to remove vendor label from metric re…
stefanhipfel Jan 23, 2026
7faf701
Merge branch 'main' into redfish_events
stefanhipfel Jan 30, 2026
a7fa18c
fixes minor issues
stefanhipfel Jan 30, 2026
2fd3316
Merge branch 'main' into redfish_events
stefanhipfel Feb 4, 2026
9c5bb73
fixes lint errors
stefanhipfel Feb 4, 2026
ce07ed8
fixes mock delete handler
stefanhipfel Feb 4, 2026
b9f11c5
run make docs
stefanhipfel Feb 4, 2026
4e2f853
improve handleEventSubscriptions function
stefanhipfel Feb 5, 2026
f521c13
guard against empty Location header
stefanhipfel Feb 5, 2026
71c96e3
dont use defer for patching
stefanhipfel Feb 5, 2026
1103dd1
Move collector into cache only after successful registration.
stefanhipfel Feb 12, 2026
354242a
use metrics collector
stefanhipfel Feb 12, 2026
85fe087
adds license headers
stefanhipfel Feb 12, 2026
e16f7e5
fixes lint issues
stefanhipfel Feb 12, 2026
8ec6768
fix: change receiver type for DeleteEventSubscription method to pointer
stefanhipfel Feb 16, 2026
8e2c76c
fix: update event URL handling to log error when address is not provided
stefanhipfel Feb 17, 2026
bb70235
Merge branch 'main' into redfish_events
stefanhipfel Feb 18, 2026
95e8a50
Refactor BMCReconciler to remove unnecessary log parameter from event…
stefanhipfel Feb 18, 2026
96f2380
Implement critical event handling and server tainting in response to …
stefanhipfel Feb 19, 2026
75ad8b1
Refactor taintServer function to comment out tainting logic for criti…
stefanhipfel Feb 19, 2026
9fe9af0
Merge branch 'main' into events_taints
stefanhipfel Feb 19, 2026
7bff181
Update import paths in subscription.go and go.mod for consistency
stefanhipfel Feb 19, 2026
dbc8edd
fixes coderabbit issues
stefanhipfel Mar 11, 2026
5a0108e
Merge branch 'main' into events_taints
stefanhipfel Mar 11, 2026
adca1ef
fixes minor issue
stefanhipfel Mar 11, 2026
57b045b
fixes minor issues
stefanhipfel Mar 11, 2026
4ca7a5d
fixes coderabbit issues
stefanhipfel Mar 12, 2026
cf04864
Fix critical event handler to never drop events
stefanhipfel Mar 13, 2026
0480e34
Allow BMC deletion even when subscription cleanup fails
stefanhipfel Mar 13, 2026
9824b30
Fix subscription link loss when partial subscription creation fails
stefanhipfel Mar 16, 2026
ac5069e
fixes lint issues
stefanhipfel Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bmc/bmc.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ type BMC interface {
GetBMCUpgradeTask(ctx context.Context, manufacturer string, taskURI string) (*schemas.Task, error)

// CreateEventSubscription creates an event subscription for the manager.
CreateEventSubscription(ctx context.Context, destination string, eventType schemas.EventFormatType, protocol schemas.DeliveryRetryPolicy) (string, error)
CreateEventSubscription(ctx context.Context, destination string, eventType schemas.EventFormatType, deliveryRetryPolicy schemas.DeliveryRetryPolicy) (string, error)

// DeleteEventSubscription deletes an event subscription for the manager.
DeleteEventSubscription(ctx context.Context, uri string) error
Expand Down
26 changes: 19 additions & 7 deletions bmc/mock/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"net/http"
"path"
"slices"
"strconv"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -179,7 +180,15 @@ func (s *MockServer) handlePost(w http.ResponseWriter, r *http.Request) {
}
// If resource collection (has "Members"), add a new member
if len(base.Members) > 0 {
newID := fmt.Sprintf("%d", len(base.Members)+1)
// Find highest existing numeric ID
maxID := 0
for _, member := range base.Members {
idStr := path.Base(member.OdataID)
if id, err := strconv.Atoi(idStr); err == nil && id > maxID {
maxID = id
}
}
newID := fmt.Sprintf("%d", maxID+1)
location := path.Join(r.URL.Path, newID)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
newMemberPath := resolvePath(location)
base.Members = append(base.Members, Member{
Expand All @@ -194,12 +203,14 @@ func (s *MockServer) handlePost(w http.ResponseWriter, r *http.Request) {
} else {
base.Members = make([]Member, 0)
location := r.URL.JoinPath("1").String()
newMemberPath := resolvePath(location)
base.Members = []Member{
{
OdataID: r.URL.JoinPath("1").String(),
},
}
s.overrides[urlPath] = base
s.overrides[newMemberPath] = update
if strings.HasSuffix(r.URL.Path, "/Subscriptions") {
w.Header().Set("Location", location)
}
Expand Down Expand Up @@ -256,12 +267,15 @@ func (s *MockServer) handleDelete(w http.ResponseWriter, r *http.Request) {
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
return
}

s.mu.Lock()
defer s.mu.Unlock()

delete(s.overrides, filePath)
s.mu.Unlock()

// get collection of the resource
collectionPath := path.Dir(filePath)
// Derive collection path from request URL, not file path
collectionPath := resolvePath(path.Dir(r.URL.Path))

cached, hasOverride := s.overrides[collectionPath]
var collection Collection
if hasOverride {
Expand All @@ -272,7 +286,7 @@ func (s *MockServer) handleDelete(w http.ResponseWriter, r *http.Request) {
return
}
} else {
data, err := dataFS.ReadFile(collectionPath + "/index.json")
data, err := dataFS.ReadFile(collectionPath)
if err != nil {
http.NotFound(w, r)
return
Expand All @@ -291,9 +305,7 @@ func (s *MockServer) handleDelete(w http.ResponseWriter, r *http.Request) {
}
s.log.Info("Removing member from collection", "members", newMembers, "collection", collectionPath)
collection.Members = newMembers
s.mu.Lock()
s.overrides[collectionPath] = collection
s.mu.Unlock()
w.WriteHeader(http.StatusNoContent)
}

Expand Down
4 changes: 2 additions & 2 deletions bmc/redfish.go
Original file line number Diff line number Diff line change
Expand Up @@ -994,7 +994,7 @@ func (r *RedfishBaseBMC) CreateEventSubscription(
ctx context.Context,
destination string,
eventFormatType schemas.EventFormatType,
retry schemas.DeliveryRetryPolicy,
deliveryRetryPolicy schemas.DeliveryRetryPolicy,
) (string, error) {
service := r.client.GetService()
ev, err := service.EventService()
Expand All @@ -1008,7 +1008,7 @@ func (r *RedfishBaseBMC) CreateEventSubscription(
Destination: destination,
EventFormatType: eventFormatType, // event or metricreport
Protocol: schemas.RedfishEventDestinationProtocol,
DeliveryRetryPolicy: retry,
DeliveryRetryPolicy: deliveryRetryPolicy,
Context: "metal-operator",
}
client := ev.GetClient()
Expand Down
5 changes: 5 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,11 @@ func main() { // nolint: gocyclo
if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
setupLog.Info("starting event server for alerts and metrics", "EventURL", eventURL)
eventServer := serverevents.NewServer(setupLog, fmt.Sprintf(":%d", eventPort))
eventServer.SetClient(mgr.GetClient())

criticalEventHandler := serverevents.CreateCriticalEventHandler(mgr.GetClient(), setupLog)
eventServer.SetCriticalEventHandler(criticalEventHandler)

if err := eventServer.Start(ctx); err != nil {
return fmt.Errorf("unable to start event server: %w", err)
}
Expand Down
38 changes: 23 additions & 15 deletions internal/controller/bmc_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,10 @@ func (r *BMCReconciler) delete(ctx context.Context, bmcObj *metalv1alpha1.BMC) (
if err == nil {
defer bmcClient.Logout()
if err := r.deleteEventSubscription(ctx, bmcClient, bmcObj); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to delete event subscriptions: %w", err)
log.Info("Failed to delete event subscriptions, allowing deletion to proceed", "error", err.Error())
}
} else {
log.Info("Cannot create BMC client during deletion, subscription cleanup will be skipped", "error", err.Error())
}

if _, err := clientutils.PatchEnsureNoFinalizer(ctx, r.Client, bmcObj, BMCFinalizer); err != nil {
Expand Down Expand Up @@ -552,30 +554,36 @@ func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient
log.V(1).Info("Handling event subscriptions for BMC")
modified := false

if bmcObj.Status.MetricsReportSubscriptionLink == "" {
link, err := serverevents.SubscribeMetricsReport(ctx, r.EventURL, bmcObj.Name, bmcClient)
if err != nil {
return false, fmt.Errorf("failed to subscribe to server metrics report: %w", err)
}
bmcBase := bmcObj.DeepCopy()
bmcObj.Status.MetricsReportSubscriptionLink = link
modified = true
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return false, fmt.Errorf("failed to patch server status with subscription links: %w", err)
}
}
// Handle EventsSubscription
if bmcObj.Status.EventsSubscriptionLink == "" {
bmcBase := bmcObj.DeepCopy()
link, err := serverevents.SubscribeEvents(ctx, r.EventURL, bmcObj.Name, bmcClient)
if err != nil {
return false, fmt.Errorf("failed to subscribe to server alerts: %w", err)
}
bmcBase := bmcObj.DeepCopy()
bmcObj.Status.EventsSubscriptionLink = link
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return false, fmt.Errorf("failed to patch BMC status with events subscription link: %w", err)
}
log.V(1).Info("Created and persisted EventsSubscriptionLink", "link", link)
modified = true
}

// Handle MetricsReportSubscription
if bmcObj.Status.MetricsReportSubscriptionLink == "" {
bmcBase := bmcObj.DeepCopy()
link, err := serverevents.SubscribeMetricsReport(ctx, r.EventURL, bmcObj.Name, bmcClient)
if err != nil {
return false, fmt.Errorf("failed to subscribe to server metrics report: %w", err)
}
bmcObj.Status.MetricsReportSubscriptionLink = link
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return false, fmt.Errorf("failed to patch server status with subscription links: %w", err)
return false, fmt.Errorf("failed to patch BMC status with metrics subscription link: %w", err)
}
log.V(1).Info("Created and persisted MetricsReportSubscriptionLink", "link", link)
modified = true
}

return modified, nil
}

Expand Down
139 changes: 135 additions & 4 deletions internal/controller/bmc_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ var _ = Describe("BMC Controller", func() {
HaveField("Status.State", metalv1alpha1.BMCStateEnabled),
HaveField("Status.PowerState", metalv1alpha1.OnPowerState),
HaveField("Status.FirmwareVersion", "1.45.455b66-rev4"),
HaveField("Status.MetricsReportSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/5")),
HaveField("Status.EventsSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/6")),
HaveField("Status.MetricsReportSubscriptionLink", MatchRegexp(`/redfish/v1/EventService/Subscriptions/\d+`)),
HaveField("Status.EventsSubscriptionLink", MatchRegexp(`/redfish/v1/EventService/Subscriptions/\d+`)),
))

By("Ensuring that the Server resource will be created")
Expand Down Expand Up @@ -145,8 +145,8 @@ var _ = Describe("BMC Controller", func() {
HaveField("Status.State", metalv1alpha1.BMCStateEnabled),
HaveField("Status.PowerState", metalv1alpha1.OnPowerState),
HaveField("Status.FirmwareVersion", "1.45.455b66-rev4"),
HaveField("Status.MetricsReportSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/5")),
HaveField("Status.EventsSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/6")),
HaveField("Status.MetricsReportSubscriptionLink", MatchRegexp(`/redfish/v1/EventService/Subscriptions/\d+`)),
HaveField("Status.EventsSubscriptionLink", MatchRegexp(`/redfish/v1/EventService/Subscriptions/\d+`)),
))

By("Ensuring that the Server resource has been created")
Expand Down Expand Up @@ -328,6 +328,13 @@ var _ = Describe("BMC Controller", func() {
HaveField("Data", HaveKeyWithValue("recordType", "A")),
HaveField("Data", HaveKeyWithValue("ttl", "300")),
))

By("Ensuring that subscription links have been created")
Eventually(Object(bmc)).Should(SatisfyAll(
HaveField("Status.MetricsReportSubscriptionLink", Not(BeEmpty())),
HaveField("Status.EventsSubscriptionLink", Not(BeEmpty())),
))

server := &metalv1alpha1.Server{
ObjectMeta: metav1.ObjectMeta{
Name: bmcutils.GetServerNameFromBMCandIndex(0, bmc),
Expand All @@ -339,6 +346,130 @@ var _ = Describe("BMC Controller", func() {
Expect(k8sClient.Delete(ctx, dnsRecord)).To(Succeed())
})

// Spec: creates a BMCSecret and a BMC, waits until both subscription links
// are set in the BMC status, deletes the BMC, and waits until the API server
// reports it as not found.
// NOTE(review): despite the spec name, nothing below queries the BMC endpoint
// to confirm that the subscriptions were removed; see the note on the final
// assertions.
It("Should cleanup subscriptions on BMC deletion", func(ctx SpecContext) {
By("Creating a BMCSecret")
bmcSecret := &metalv1alpha1.BMCSecret{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "test-",
},
Data: map[string][]byte{
metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"),
metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"),
},
}
Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed())

By("Creating a BMC resource")
// The BMC endpoint is built from MockServerIP and MockServerPort
// (presumably the mock Redfish server started by the test suite — confirm).
bmc := &metalv1alpha1.BMC{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "test-bmc-",
},
Spec: metalv1alpha1.BMCSpec{
Endpoint: &metalv1alpha1.InlineEndpoint{
IP: metalv1alpha1.MustParseIP(MockServerIP),
MACAddress: "23:11:8A:33:CF:EA",
},
Protocol: metalv1alpha1.Protocol{
Name: metalv1alpha1.ProtocolRedfishLocal,
Port: MockServerPort,
},
BMCSecretRef: v1.LocalObjectReference{
Name: bmcSecret.Name,
},
},
}
Expect(k8sClient.Create(ctx, bmc)).To(Succeed())

By("Ensuring that subscription links have been created")
Eventually(Object(bmc)).Should(SatisfyAll(
HaveField("Status.MetricsReportSubscriptionLink", Not(BeEmpty())),
HaveField("Status.EventsSubscriptionLink", Not(BeEmpty())),
))

// Capture the links before deletion. This relies on the local bmc object
// having been refreshed by the Object(bmc) poll above — TODO confirm.
metricsLink := bmc.Status.MetricsReportSubscriptionLink
eventsLink := bmc.Status.EventsSubscriptionLink

By("Deleting the BMC resource")
Expect(k8sClient.Delete(ctx, bmc)).To(Succeed())

By("Ensuring that the BMC has been deleted")
Eventually(Get(bmc)).Should(Satisfy(apierrors.IsNotFound))

By("Verifying that subscriptions were cleaned up")
// NOTE(review): these assertions only re-check the values captured before
// deletion. They do not show that the subscriptions are gone from the BMC;
// a request against metricsLink/eventsLink expecting "not found" would be
// needed for that.
Expect(metricsLink).NotTo(BeEmpty())
Expect(eventsLink).NotTo(BeEmpty())

// Test cleanup: remove the secret and the Server whose name is derived from
// this BMC at index 0 (presumably created by the controller — confirm).
server := &metalv1alpha1.Server{
ObjectMeta: metav1.ObjectMeta{
Name: bmcutils.GetServerNameFromBMCandIndex(0, bmc),
},
}
Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed())
Expect(k8sClient.Delete(ctx, server)).To(Succeed())
})

// Spec: creates a BMCSecret and a BMC, waits until both subscription links
// are set in the BMC status, deletes the BMC, and expects the deletion to
// complete (the object is eventually not found).
// NOTE(review): no step in this spec makes subscription cleanup fail; see the
// note at the "Manually deleting subscription" step.
It("Should allow BMC deletion even when subscription cleanup fails", func(ctx SpecContext) {
By("Creating a BMCSecret")
bmcSecret := &metalv1alpha1.BMCSecret{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "test-",
},
Data: map[string][]byte{
metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"),
metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"),
},
}
Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed())

By("Creating a BMC resource")
// The BMC endpoint is built from MockServerIP and MockServerPort
// (presumably the mock Redfish server started by the test suite — confirm).
bmc := &metalv1alpha1.BMC{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "test-bmc-",
},
Spec: metalv1alpha1.BMCSpec{
Endpoint: &metalv1alpha1.InlineEndpoint{
IP: metalv1alpha1.MustParseIP(MockServerIP),
MACAddress: "23:11:8A:33:CF:EB",
},
Protocol: metalv1alpha1.Protocol{
Name: metalv1alpha1.ProtocolRedfishLocal,
Port: MockServerPort,
},
BMCSecretRef: v1.LocalObjectReference{
Name: bmcSecret.Name,
},
},
}
Expect(k8sClient.Create(ctx, bmc)).To(Succeed())

By("Ensuring that subscription links have been created")
Eventually(Object(bmc)).Should(SatisfyAll(
HaveField("Status.MetricsReportSubscriptionLink", Not(BeEmpty())),
HaveField("Status.EventsSubscriptionLink", Not(BeEmpty())),
))

By("Manually deleting subscription from mock BMC to simulate already-deleted state")
// In a real scenario, this could happen if:
// - Subscription was already deleted directly on BMC
// - BMC was factory reset
// - BMC firmware was upgraded
// The subscription link in status still exists, but BMC will return 404 when we try to delete it
// NOTE(review): no code accompanies this step. Nothing in this spec removes
// a subscription from the mock BMC, so the deletion below likely runs the
// normal cleanup path rather than a failing one. Confirm, and add the
// missing deletion against the mock server if a failure is meant to be
// simulated.

By("Deleting the BMC resource")
Expect(k8sClient.Delete(ctx, bmc)).To(Succeed())

By("Ensuring that the BMC deletion succeeds despite subscription cleanup failure")
Eventually(Get(bmc)).Should(Satisfy(apierrors.IsNotFound))

// Test cleanup: remove the secret and the Server whose name is derived from
// this BMC at index 0 (presumably created by the controller — confirm).
server := &metalv1alpha1.Server{
ObjectMeta: metav1.ObjectMeta{
Name: bmcutils.GetServerNameFromBMCandIndex(0, bmc),
},
}
Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed())
Expect(k8sClient.Delete(ctx, server)).To(Succeed())
})

})

var _ = Describe("BMC Validation", func() {
Expand Down
Loading
Loading