diff --git a/bmc/bmc.go b/bmc/bmc.go index 24a22d46a..48cc5fc1c 100644 --- a/bmc/bmc.go +++ b/bmc/bmc.go @@ -127,6 +127,10 @@ type BMC interface { // CreateEventSubscription creates an event subscription for the manager. CreateEventSubscription(ctx context.Context, destination string, eventType schemas.EventFormatType, protocol schemas.DeliveryRetryPolicy) (string, error) + // GetEventSubscription checks whether the event subscription at the given URI still exists on the BMC. + // Returns (true, nil) if the subscription exists, (false, nil) if it does not, or (false, err) on failure. + GetEventSubscription(ctx context.Context, uri string) (bool, error) + // DeleteEventSubscription deletes an event subscription for the manager. DeleteEventSubscription(ctx context.Context, uri string) error diff --git a/bmc/mock/server/server.go b/bmc/mock/server/server.go index 41992287c..1ec179cef 100644 --- a/bmc/mock/server/server.go +++ b/bmc/mock/server/server.go @@ -1290,6 +1290,61 @@ func (s *MockServer) ResetAccounts() { s.accounts = loadAccountsFromEmbedded() } +// DeleteSubscription removes the subscription at uri from the mock server's state. +// Use this in tests to simulate a subscription being deleted externally on the BMC +// (e.g., by a firmware reset or admin action) so the controller's stale-link recovery +// path can be exercised. +func (s *MockServer) DeleteSubscription(uri string) { + filePath := resolvePath(uri) + // Compute the collection key from the parent Redfish URL (not the file path). + // resolvePath on the file path gives "data/.../5/index.json" whose path.Dir is + // "data/.../5" — the wrong key. The collection is stored under the key produced + // by resolvePath of the parent URL, e.g. "data/EventService/Subscriptions/index.json". + collectionKey := resolvePath(path.Dir(uri)) + + s.mu.Lock() + defer s.mu.Unlock() + + delete(s.overrides, filePath) + + // Remove the member from the collection override (or load from embedded and update). + cached, hasOverride := s.overrides[collectionKey] + var col Collection + if hasOverride { + // (1) If the cached value is not a Collection, bail rather than silently + // overwriting it with an empty Collection. + c, ok := cached.(Collection) + if !ok { + s.log.Error(nil, "Cached value for collection key is not a Collection, skipping member removal", "collectionKey", collectionKey) + return + } + col = c + } else { + // (2) Propagate ReadFile/Unmarshal errors rather than silently using an + // empty Collection. + data, err := dataFS.ReadFile(collectionKey) + if err != nil { + s.log.Error(err, "Failed to read embedded collection data", "collectionKey", collectionKey) + return + } + if err := json.Unmarshal(data, &col); err != nil { + s.log.Error(err, "Failed to parse embedded collection data", "collectionKey", collectionKey) + return + } + } + newMembers := make([]Member, 0, len(col.Members)) + for _, m := range col.Members { + if m.OdataID != uri { + newMembers = append(newMembers, m) + } + } + col.Members = newMembers + // (3) Reaching here guarantees col was validly loaded (invalid cases already + // returned above), so persist unconditionally. An empty Members slice is + // valid when the last subscription is deleted. + s.overrides[collectionKey] = col +} + // Start starts the mock server and stops on ctx cancellation. func (s *MockServer) Start(ctx context.Context) error { if s.handler == nil { diff --git a/bmc/redfish.go b/bmc/redfish.go index a631b83c1..76b9fe05a 100644 --- a/bmc/redfish.go +++ b/bmc/redfish.go @@ -12,6 +12,7 @@ import ( "io" "maps" "math/big" + "net/http" "net/url" "slices" "strings" @@ -1136,6 +1137,27 @@ func (r *RedfishBaseBMC) findExistingSubscription(destination string) (string, e return "", fmt.Errorf("existing subscription not found for destination: %s", destination) } +func (r *RedfishBaseBMC) GetEventSubscription(_ context.Context, uri string) (bool, error) { + service := r.client.GetService() + ev, err := service.EventService() + if err != nil { + return false, fmt.Errorf("failed to get event service: %w", err) + } + if !ev.ServiceEnabled { + return false, fmt.Errorf("event service is not enabled") + } + event, err := ev.GetEventSubscription(uri) + if err != nil { + // A 404 means the subscription no longer exists on the BMC — not a real error. + var redfishErr *schemas.Error + if errors.As(err, &redfishErr) && redfishErr.HTTPReturnedStatusCode == http.StatusNotFound { + return false, nil + } + return false, fmt.Errorf("failed to get event subscription: %w", err) + } + return event != nil, nil +} + func (r *RedfishBaseBMC) DeleteEventSubscription(ctx context.Context, uri string) error { service := r.client.GetService() ev, err := service.EventService() diff --git a/internal/controller/bmc_controller.go b/internal/controller/bmc_controller.go index bb464d149..3772d3e06 100644 --- a/internal/controller/bmc_controller.go +++ b/internal/controller/bmc_controller.go @@ -564,6 +564,20 @@ func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient log.V(1).Info("Handling event subscriptions for BMC", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP) modified := false + if bmcObj.Status.MetricsReportSubscriptionLink != "" { + exists, err := bmcClient.GetEventSubscription(ctx, bmcObj.Status.MetricsReportSubscriptionLink) + if err != nil { + return false, fmt.Errorf("failed to check metrics report subscription for BMC %s (%s): %w", bmcObj.Name, bmcObj.Status.IP, err) + } + if !exists { + log.Info("Metrics report subscription no longer exists on BMC, clearing stale link", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "link", bmcObj.Status.MetricsReportSubscriptionLink) + bmcBase := bmcObj.DeepCopy() + bmcObj.Status.MetricsReportSubscriptionLink = "" + if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil { + return false, fmt.Errorf("failed to patch BMC status to clear stale metrics subscription link: %w", err) + } + } + } if bmcObj.Status.MetricsReportSubscriptionLink == "" { link, err := serverevents.SubscribeMetricsReport(ctx, r.EventURL, bmcObj.Name, bmcClient) if err != nil { @@ -573,10 +587,25 @@ func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient bmcObj.Status.MetricsReportSubscriptionLink = link modified = true if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil { - return false, fmt.Errorf("failed to patch server status with subscription links: %w", err) + return false, fmt.Errorf("failed to patch BMC status with metrics subscription link: %w", err) } log.Info("Event subscription established", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "type", "metrics", "link", link) } + + if bmcObj.Status.EventsSubscriptionLink != "" { + exists, err := bmcClient.GetEventSubscription(ctx, bmcObj.Status.EventsSubscriptionLink) + if err != nil { + return false, fmt.Errorf("failed to check events subscription for BMC %s (%s): %w", bmcObj.Name, bmcObj.Status.IP, err) + } + if !exists { + log.Info("Events subscription no longer exists on BMC, clearing stale link", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "link", bmcObj.Status.EventsSubscriptionLink) + bmcBase := bmcObj.DeepCopy() + bmcObj.Status.EventsSubscriptionLink = "" + if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil { + return false, fmt.Errorf("failed to patch BMC status to clear stale events subscription link: %w", err) + } + } + } if bmcObj.Status.EventsSubscriptionLink == "" { link, err := serverevents.SubscribeEvents(ctx, r.EventURL, bmcObj.Name, bmcClient) if err != nil { @@ -586,7 +615,7 @@ func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient bmcObj.Status.EventsSubscriptionLink = link modified = true if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil { - return false, fmt.Errorf("failed to patch server status with subscription links: %w", err) + return false, fmt.Errorf("failed to patch BMC status with events subscription link: %w", err) } log.Info("Event subscription established", "bmcName", bmcObj.Name, "bmcIP", bmcObj.Status.IP, "type", "events", "link", link) } diff --git a/internal/controller/bmc_controller_test.go b/internal/controller/bmc_controller_test.go index 2c9913ebf..45b6af4a2 100644 --- a/internal/controller/bmc_controller_test.go +++ b/internal/controller/bmc_controller_test.go @@ -338,6 +338,82 @@ var _ = Describe("BMC Controller", func() { Expect(k8sClient.Delete(ctx, dnsRecord)).To(Succeed()) }) + It("should recreate event subscriptions when they are deleted externally on the BMC", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource") + bmcObj := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "23:11:8A:33:CF:EA", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + } + Expect(k8sClient.Create(ctx, bmcObj)).To(Succeed()) + + By("Waiting for initial subscriptions to be established") + Eventually(Object(bmcObj)).Should(SatisfyAll( + HaveField("Status.MetricsReportSubscriptionLink", Not(BeEmpty())), + HaveField("Status.EventsSubscriptionLink", Not(BeEmpty())), + )) + initialMetricsLink := bmcObj.Status.MetricsReportSubscriptionLink + initialEventsLink := bmcObj.Status.EventsSubscriptionLink + + // Delete only the metrics subscription. The events subscription (/6) remains in + // the mock server's collection, so the next POST will assign ID 7 (not 5), giving + // us a verifiably different link after recreation. + By("Simulating external deletion of the metrics subscription on the BMC") + mockServers[0].DeleteSubscription(initialMetricsLink) + + By("Triggering a reconcile by annotating the BMC object") + Eventually(Update(bmcObj, func() { + if bmcObj.Annotations == nil { + bmcObj.Annotations = map[string]string{} + } + bmcObj.Annotations["metal.ironcore.dev/reconcile-trigger"] = time.Now().Format(time.RFC3339Nano) + })).Should(Succeed()) + + By("Expecting the stale metrics subscription to be detected and recreated with a new link") + Eventually(Object(bmcObj)).Should(SatisfyAll( + HaveField("Status.MetricsReportSubscriptionLink", Not(BeEmpty())), + HaveField("Status.MetricsReportSubscriptionLink", Not(Equal(initialMetricsLink))), + HaveField("Status.EventsSubscriptionLink", Equal(initialEventsLink)), + )) + + By("Cleaning up") + server := &metalv1alpha1.Server{ + ObjectMeta: metav1.ObjectMeta{ + Name: bmcutils.GetServerNameFromBMCandIndex(0, bmcObj), + }, + } + Expect(k8sClient.Delete(ctx, bmcObj)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + Expect(k8sClient.Delete(ctx, server)).To(Succeed()) + Eventually(Get(bmcObj)).Should(Satisfy(apierrors.IsNotFound)) + Eventually(Get(server)).Should(Satisfy(apierrors.IsNotFound)) + }) + }) var _ = Describe("BMC Validation", func() {