Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions bmc/bmc.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ type BMC interface {
// CreateEventSubscription creates an event subscription for the manager.
CreateEventSubscription(ctx context.Context, destination string, eventType schemas.EventFormatType, protocol schemas.DeliveryRetryPolicy) (string, error)

// GetEventSubscription checks whether the event subscription at the given URI still exists on the BMC.
// Returns (true, nil) if the subscription exists, (false, nil) if it does not, or (false, err) on failure.
GetEventSubscription(ctx context.Context, uri string) (bool, error)

// DeleteEventSubscription deletes an event subscription for the manager.
DeleteEventSubscription(ctx context.Context, uri string) error

Expand Down
55 changes: 55 additions & 0 deletions bmc/mock/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1290,6 +1290,61 @@ func (s *MockServer) ResetAccounts() {
s.accounts = loadAccountsFromEmbedded()
}

// DeleteSubscription removes the subscription at uri from the mock server's state.
// Use this in tests to simulate a subscription being deleted externally on the BMC
// (e.g., by a firmware reset or admin action) so the controller's stale-link recovery
// path can be exercised.
func (s *MockServer) DeleteSubscription(uri string) {
filePath := resolvePath(uri)
// Compute the collection key from the parent Redfish URL (not the file path).
// resolvePath on the file path gives "data/.../5/index.json" whose path.Dir is
// "data/.../5" — the wrong key. The collection is stored under the key produced
// by resolvePath of the parent URL, e.g. "data/EventService/Subscriptions/index.json".
collectionKey := resolvePath(path.Dir(uri))

s.mu.Lock()
defer s.mu.Unlock()

delete(s.overrides, filePath)

// Remove the member from the collection override (or load from embedded and update).
cached, hasOverride := s.overrides[collectionKey]
var col Collection
if hasOverride {
// (1) If the cached value is not a Collection, bail rather than silently
// overwriting it with an empty Collection.
c, ok := cached.(Collection)
if !ok {
s.log.Error(nil, "Cached value for collection key is not a Collection, skipping member removal", "collectionKey", collectionKey)
return
}
col = c
} else {
// (2) Propagate ReadFile/Unmarshal errors rather than silently using an
// empty Collection.
data, err := dataFS.ReadFile(collectionKey)
if err != nil {
s.log.Error(err, "Failed to read embedded collection data", "collectionKey", collectionKey)
return
}
if err := json.Unmarshal(data, &col); err != nil {
s.log.Error(err, "Failed to parse embedded collection data", "collectionKey", collectionKey)
return
}
Comment thread
asergeant01 marked this conversation as resolved.
}
newMembers := make([]Member, 0, len(col.Members))
for _, m := range col.Members {
if m.OdataID != uri {
newMembers = append(newMembers, m)
}
}
col.Members = newMembers
// (3) Reaching here guarantees col was validly loaded (invalid cases already
// returned above), so persist unconditionally. An empty Members slice is
// valid when the last subscription is deleted.
s.overrides[collectionKey] = col
}

// Start starts the mock server and stops on ctx cancellation.
func (s *MockServer) Start(ctx context.Context) error {
if s.handler == nil {
Expand Down
22 changes: 22 additions & 0 deletions bmc/redfish.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"io"
"maps"
"math/big"
"net/http"
"net/url"
"slices"
"strings"
Expand Down Expand Up @@ -1136,6 +1137,27 @@ func (r *RedfishBaseBMC) findExistingSubscription(destination string) (string, e
return "", fmt.Errorf("existing subscription not found for destination: %s", destination)
}

func (r *RedfishBaseBMC) GetEventSubscription(_ context.Context, uri string) (bool, error) {
service := r.client.GetService()
ev, err := service.EventService()
if err != nil {
return false, fmt.Errorf("failed to get event service: %w", err)
}
if !ev.ServiceEnabled {
return false, fmt.Errorf("event service is not enabled")
}
event, err := ev.GetEventSubscription(uri)
if err != nil {
// A 404 means the subscription no longer exists on the BMC — not a real error.
var redfishErr *schemas.Error
if errors.As(err, &redfishErr) && redfishErr.HTTPReturnedStatusCode == http.StatusNotFound {
return false, nil
}
return false, fmt.Errorf("failed to get event subscription: %w", err)
}
return event != nil, nil
}

func (r *RedfishBaseBMC) DeleteEventSubscription(ctx context.Context, uri string) error {
service := r.client.GetService()
ev, err := service.EventService()
Expand Down
33 changes: 31 additions & 2 deletions internal/controller/bmc_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,20 @@ func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient
log.V(1).Info("Handling event subscriptions for BMC", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP)
modified := false

if bmcObj.Status.MetricsReportSubscriptionLink != "" {
exists, err := bmcClient.GetEventSubscription(ctx, bmcObj.Status.MetricsReportSubscriptionLink)
if err != nil {
return false, fmt.Errorf("failed to check metrics report subscription for BMC %s (%s): %w", bmcObj.Name, bmcObj.Status.IP, err)
}
Comment on lines +569 to +571
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about logging this error e.g. in case we run into a transient error? Otherwise the subscription logic will block the BMC reconciliation.

if !exists {
log.Info("Metrics report subscription no longer exists on BMC, clearing stale link", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "link", bmcObj.Status.MetricsReportSubscriptionLink)
bmcBase := bmcObj.DeepCopy()
bmcObj.Status.MetricsReportSubscriptionLink = ""
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return false, fmt.Errorf("failed to patch BMC status to clear stale metrics subscription link: %w", err)
}
Comment on lines +576 to +578
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just in memory clear the link but not patch the resource as we are doing later anyway. This will reduce the API calls.

}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if bmcObj.Status.MetricsReportSubscriptionLink == "" {
link, err := serverevents.SubscribeMetricsReport(ctx, r.EventURL, bmcObj.Name, bmcClient)
if err != nil {
Expand All @@ -573,10 +587,25 @@ func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient
bmcObj.Status.MetricsReportSubscriptionLink = link
modified = true
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return false, fmt.Errorf("failed to patch server status with subscription links: %w", err)
return false, fmt.Errorf("failed to patch BMC status with metrics subscription link: %w", err)
}
log.Info("Event subscription established", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "type", "metrics", "link", link)
}

if bmcObj.Status.EventsSubscriptionLink != "" {
exists, err := bmcClient.GetEventSubscription(ctx, bmcObj.Status.EventsSubscriptionLink)
if err != nil {
return false, fmt.Errorf("failed to check events subscription for BMC %s (%s): %w", bmcObj.Name, bmcObj.Status.IP, err)
}
Comment on lines +596 to +599
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about logging this error e.g. in case we run into a transient error? Otherwise the subscription logic will block the BMC reconciliation.

if !exists {
log.Info("Events subscription no longer exists on BMC, clearing stale link", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "link", bmcObj.Status.EventsSubscriptionLink)
bmcBase := bmcObj.DeepCopy()
bmcObj.Status.EventsSubscriptionLink = ""
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return false, fmt.Errorf("failed to patch BMC status to clear stale events subscription link: %w", err)
}
Comment on lines +604 to +606
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just in memory clear the link but not patch the resource as we are doing later anyway. This will reduce the API calls.

}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if bmcObj.Status.EventsSubscriptionLink == "" {
link, err := serverevents.SubscribeEvents(ctx, r.EventURL, bmcObj.Name, bmcClient)
if err != nil {
Expand All @@ -586,7 +615,7 @@ func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient
bmcObj.Status.EventsSubscriptionLink = link
modified = true
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return false, fmt.Errorf("failed to patch server status with subscription links: %w", err)
return false, fmt.Errorf("failed to patch BMC status with events subscription link: %w", err)
}
log.Info("Event subscription established", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "type", "events", "link", link)
}
Expand Down
76 changes: 76 additions & 0 deletions internal/controller/bmc_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,82 @@ var _ = Describe("BMC Controller", func() {
Expect(k8sClient.Delete(ctx, dnsRecord)).To(Succeed())
})

It("should recreate event subscriptions when they are deleted externally on the BMC", func(ctx SpecContext) {
By("Creating a BMCSecret")
bmcSecret := &metalv1alpha1.BMCSecret{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "test-",
},
Data: map[string][]byte{
metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"),
metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"),
},
}
Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed())

By("Creating a BMC resource")
bmcObj := &metalv1alpha1.BMC{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "test-bmc-",
},
Spec: metalv1alpha1.BMCSpec{
Endpoint: &metalv1alpha1.InlineEndpoint{
IP: metalv1alpha1.MustParseIP(MockServerIP),
MACAddress: "23:11:8A:33:CF:EA",
},
Protocol: metalv1alpha1.Protocol{
Name: metalv1alpha1.ProtocolRedfishLocal,
Port: MockServerPort,
},
BMCSecretRef: v1.LocalObjectReference{
Name: bmcSecret.Name,
},
},
}
Expect(k8sClient.Create(ctx, bmcObj)).To(Succeed())

By("Waiting for initial subscriptions to be established")
Eventually(Object(bmcObj)).Should(SatisfyAll(
HaveField("Status.MetricsReportSubscriptionLink", Not(BeEmpty())),
HaveField("Status.EventsSubscriptionLink", Not(BeEmpty())),
))
initialMetricsLink := bmcObj.Status.MetricsReportSubscriptionLink
initialEventsLink := bmcObj.Status.EventsSubscriptionLink

// Delete only the metrics subscription. The events subscription (/6) remains in
// the mock server's collection, so the next POST will assign ID 7 (not 5), giving
// us a verifiably different link after recreation.
By("Simulating external deletion of the metrics subscription on the BMC")
mockServers[0].DeleteSubscription(initialMetricsLink)

By("Triggering a reconcile by annotating the BMC object")
Eventually(Update(bmcObj, func() {
if bmcObj.Annotations == nil {
bmcObj.Annotations = map[string]string{}
}
bmcObj.Annotations["metal.ironcore.dev/reconcile-trigger"] = time.Now().Format(time.RFC3339Nano)
})).Should(Succeed())

By("Expecting the stale metrics subscription to be detected and recreated with a new link")
Eventually(Object(bmcObj)).Should(SatisfyAll(
HaveField("Status.MetricsReportSubscriptionLink", Not(BeEmpty())),
HaveField("Status.MetricsReportSubscriptionLink", Not(Equal(initialMetricsLink))),
HaveField("Status.EventsSubscriptionLink", Equal(initialEventsLink)),
))

By("Cleaning up")
server := &metalv1alpha1.Server{
ObjectMeta: metav1.ObjectMeta{
Name: bmcutils.GetServerNameFromBMCandIndex(0, bmcObj),
},
}
Expect(k8sClient.Delete(ctx, bmcObj)).To(Succeed())
Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed())
Expect(k8sClient.Delete(ctx, server)).To(Succeed())
Eventually(Get(bmcObj)).Should(Satisfy(apierrors.IsNotFound))
Eventually(Get(server)).Should(Satisfy(apierrors.IsNotFound))
})

})

var _ = Describe("BMC Validation", func() {
Expand Down
Loading