Skip to content
Open
Show file tree
Hide file tree
Changes from 44 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
9333c7d
Add event subscription management for server events and metrics
stefanhipfel Oct 13, 2025
ecec4f8
Add SPDX license headers
stefanhipfel Oct 13, 2025
ea45555
Add metrics and events subscription links to ServerStatus and update …
stefanhipfel Oct 29, 2025
dbad795
make check
stefanhipfel Oct 29, 2025
4e40c37
make generate && make docs && make helm
stefanhipfel Oct 30, 2025
62cbaf9
Merge branch 'main' into redfish_events
stefanhipfel Oct 30, 2025
bbad483
moves event subscription to bmc
stefanhipfel Nov 5, 2025
a527b0a
Merge branch 'main' into redfish_events
stefanhipfel Nov 10, 2025
388d742
Merge branch 'main' into redfish_events
stefanhipfel Nov 13, 2025
dbe2b7f
Merge branch 'main' into redfish_events
stefanhipfel Dec 29, 2025
f41134a
Fix SubscribeMetricsReport to return an empty string on error and add…
stefanhipfel Dec 29, 2025
654d26f
Remove redundant EventURL field from ServerReconciler
stefanhipfel Dec 29, 2025
77babc4
Merge branch 'main' into redfish_events
stefanhipfel Jan 12, 2026
e15979d
Merge branch 'main' into redfish_events
stefanhipfel Jan 20, 2026
0a0b668
Fix comment typo in CreateEventSubscription method and remove unused …
stefanhipfel Jan 21, 2026
fc2d83b
Enhance error handling in event subscription creation and clean up ma…
stefanhipfel Jan 22, 2026
1546e65
Fix URL formatting in SubscribeMetricsReport and SubscribeEvents func…
stefanhipfel Jan 22, 2026
0e80893
Refactor alert handling to remove vendor label and improve logging in…
stefanhipfel Jan 22, 2026
e907634
Refactor event subscription handling to improve error management and …
stefanhipfel Jan 23, 2026
07c4ad7
Refactor metrics report handler to remove vendor label from metric re…
stefanhipfel Jan 23, 2026
7faf701
Merge branch 'main' into redfish_events
stefanhipfel Jan 30, 2026
a7fa18c
fixes minor issues
stefanhipfel Jan 30, 2026
2fd3316
Merge branch 'main' into redfish_events
stefanhipfel Feb 4, 2026
9c5bb73
fixes lint errors
stefanhipfel Feb 4, 2026
ce07ed8
fixes mock delete handler
stefanhipfel Feb 4, 2026
b9f11c5
run make docs
stefanhipfel Feb 4, 2026
4e2f853
improve handleEventSubscriptions function
stefanhipfel Feb 5, 2026
f521c13
guard against empty Location header
stefanhipfel Feb 5, 2026
71c96e3
dont use defer for patching
stefanhipfel Feb 5, 2026
1103dd1
Move collector into cache only after successful registration.
stefanhipfel Feb 12, 2026
354242a
use metrics collector
stefanhipfel Feb 12, 2026
85fe087
adds license headers
stefanhipfel Feb 12, 2026
e16f7e5
fixes lint issues
stefanhipfel Feb 12, 2026
8ec6768
fix: change receiver type for DeleteEventSubscription method to pointer
stefanhipfel Feb 16, 2026
8e2c76c
fix: update event URL handling to log error when address is not provided
stefanhipfel Feb 17, 2026
bb70235
Merge branch 'main' into redfish_events
stefanhipfel Feb 18, 2026
95e8a50
Refactor BMCReconciler to remove unnecessary log parameter from event…
stefanhipfel Feb 18, 2026
96f2380
Implement critical event handling and server tainting in response to …
stefanhipfel Feb 19, 2026
75ad8b1
Refactor taintServer function to comment out tainting logic for criti…
stefanhipfel Feb 19, 2026
9fe9af0
Merge branch 'main' into events_taints
stefanhipfel Feb 19, 2026
7bff181
Update import paths in subscription.go and go.mod for consistency
stefanhipfel Feb 19, 2026
dbc8edd
fixes coderabbit issues
stefanhipfel Mar 11, 2026
5a0108e
Merge branch 'main' into events_taints
stefanhipfel Mar 11, 2026
adca1ef
fixes minor issue
stefanhipfel Mar 11, 2026
57b045b
fixes minor issues
stefanhipfel Mar 11, 2026
4ca7a5d
fixes coderabbit issues
stefanhipfel Mar 12, 2026
cf04864
Fix critical event handler to never drop events
stefanhipfel Mar 13, 2026
0480e34
Allow BMC deletion even when subscription cleanup fails
stefanhipfel Mar 13, 2026
9824b30
Fix subscription link loss when partial subscription creation fails
stefanhipfel Mar 16, 2026
ac5069e
fixes lint issues
stefanhipfel Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions api/v1alpha1/bmc_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,14 @@ type BMCStatus struct {
// +optional
LastResetTime *metav1.Time `json:"lastResetTime,omitempty"`

// MetricsReportSubscriptionLink is the link to the metrics report subscription of the bmc.
// +optional
MetricsReportSubscriptionLink string `json:"metricsReportSubscriptionLink,omitempty"`

// EventsSubscriptionLink is the link to the events subscription of the bmc.
// +optional
EventsSubscriptionLink string `json:"eventsSubscriptionLink,omitempty"`

// Conditions represents the latest available observations of the BMC's current state.
// +patchStrategy=merge
// +patchMergeKey=type
Expand Down
2 changes: 1 addition & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions bmc/bmc.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ type BMC interface {
// GetBMCUpgradeTask retrieves the task for the BMC upgrade.
GetBMCUpgradeTask(ctx context.Context, manufacturer string, taskURI string) (*schemas.Task, error)

// CreateEventSubscription creates an event subscription for the manager.
CreateEventSubscription(ctx context.Context, destination string, eventType schemas.EventFormatType, deliveryRetryPolicy schemas.DeliveryRetryPolicy) (string, error)

// DeleteEventSubscription deletes an event subscription for the manager.
DeleteEventSubscription(ctx context.Context, uri string) error

// CreateOrUpdateAccount creates or updates a BMC user account.
CreateOrUpdateAccount(ctx context.Context, userName, role, password string, enabled bool) error

Expand Down
3 changes: 2 additions & 1 deletion bmc/mock/server/data/Managers/BMC/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"Name": "Manager",
"ManagerType": "BMC",
"Description": "Contoso BMC",
"Manufacturer": "Contoso",
"ServiceEntryPointUUID": "92384634-2938-2342-8820-489239905423",
"UUID": "58893887-8974-2487-2389-841168418919",
"Model": "Joo Janta 200",
Expand Down Expand Up @@ -96,4 +97,4 @@
},
"@odata.id": "/redfish/v1/Managers/BMC",
"@Redfish.Copyright": "Copyright 2014-2023 DMTF. For the full DMTF copyright policy, see http://www.dmtf.org/about/policies/copyright."
}
}
118 changes: 115 additions & 3 deletions bmc/mock/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"net/http"
"path"
"slices"
"strconv"
"strings"
"sync"
"time"
Expand All @@ -26,6 +27,14 @@ var (
dataFS embed.FS
)

// Collection is the minimal view of a resource collection document that the
// mock server needs: only the list of member references is decoded, any other
// keys in the JSON are ignored.
type Collection struct {
// Members holds one reference per resource contained in the collection.
Members []Member `json:"Members"`
}

// Member is a single entry of a Collection's Members list.
type Member struct {
// OdataID is the value of the entry's "@odata.id" key; the POST handler
// fills it with the URL path of the newly created member.
OdataID string `json:"@odata.id"`
}

const (
PowerOffState = "Off"
PowerOnState = "On"
Expand Down Expand Up @@ -116,6 +125,7 @@ func (s *MockServer) handleGet(w http.ResponseWriter, r *http.Request) {
if _, err := w.Write(content); err != nil {
s.log.Error(err, "Failed to write response")
}

}

func (s *MockServer) handlePost(w http.ResponseWriter, r *http.Request) {
Expand All @@ -135,6 +145,76 @@ func (s *MockServer) handlePost(w http.ResponseWriter, r *http.Request) {
case strings.Contains(urlPath, "UpdateService/Actions/UpdateService.SimpleUpdate"):
s.writeJSON(w, http.StatusAccepted, map[string]string{"status": "Accepted"})
default:
//
urlPath := resolvePath(r.URL.Path)
var update map[string]any
if err := json.Unmarshal(body, &update); err != nil {
http.Error(w, "Invalid JSON", http.StatusBadRequest)
return
}
// Handle resource creation in collections
s.mu.Lock()
defer s.mu.Unlock()
cached, hasOverride := s.overrides[urlPath]
var base Collection
if hasOverride {
s.log.Info("Using overridden data for POST", "path", urlPath)
var ok bool
base, ok = cached.(Collection)
if !ok {
http.Error(w, "Corrupt overridden JSON", http.StatusInternalServerError)
return
}
} else {
s.log.Info("Using embedded data for POST", "path", urlPath)
data, err := dataFS.ReadFile(urlPath)
if err != nil {
s.log.Error(err, "Failed to read embedded data for POST", "path", urlPath)
http.NotFound(w, r)
return
}
if err := json.Unmarshal(data, &base); err != nil {
http.Error(w, "Corrupt embedded JSON", http.StatusInternalServerError)
return
}
}
// If resource collection (has "Members"), add a new member
if len(base.Members) > 0 {
// Find highest existing numeric ID
maxID := 0
for _, member := range base.Members {
idStr := path.Base(member.OdataID)
if id, err := strconv.Atoi(idStr); err == nil && id > maxID {
maxID = id
}
}
newID := fmt.Sprintf("%d", maxID+1)
location := path.Join(r.URL.Path, newID)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
newMemberPath := resolvePath(location)
base.Members = append(base.Members, Member{
OdataID: location,
})
s.log.Info("Adding new member", "id", newID, "location", location, "memberPath", newMemberPath)
if strings.HasSuffix(r.URL.Path, "/Subscriptions") {
w.Header().Set("Location", location)
}
s.overrides[urlPath] = base
s.overrides[newMemberPath] = update
} else {
base.Members = make([]Member, 0)
location := r.URL.JoinPath("1").String()
newMemberPath := resolvePath(location)
base.Members = []Member{
{
OdataID: r.URL.JoinPath("1").String(),
},
}
s.overrides[urlPath] = base
s.overrides[newMemberPath] = update
if strings.HasSuffix(r.URL.Path, "/Subscriptions") {
w.Header().Set("Location", location)
}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
s.writeJSON(w, http.StatusCreated, map[string]string{"status": "created"})
}
}
Expand Down Expand Up @@ -178,22 +258,54 @@ func (s *MockServer) handlePatch(w http.ResponseWriter, r *http.Request) {

// handleDelete removes a single, non-collection resource from the mock server
// and drops its reference from the parent collection's Members list.
//
// Responses:
//   - whatever handleError writes when the resource cannot be loaded
//   - 405 Method Not Allowed when the target itself is a collection
//   - 404 Not Found when the parent collection has neither an override nor
//     embedded data
//   - 500 Internal Server Error when the parent collection cannot be decoded
//   - 204 No Content on success
func (s *MockServer) handleDelete(w http.ResponseWriter, r *http.Request) {
	filePath := resolvePath(r.URL.Path)

	base, err := s.loadResource(filePath)
	if err != nil {
		s.handleError(w, r, err)
		return
	}

	if _, isCollection := base["Members"]; isCollection {
		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
		return
	}

	// The lock is held until the handler returns because s.overrides is both
	// read and written below. It must only be released by the deferred call:
	// an additional manual Unlock would leave the map accesses unprotected and
	// make the deferred Unlock fail on an already unlocked mutex.
	s.mu.Lock()
	defer s.mu.Unlock()

	delete(s.overrides, filePath)

	// Derive collection path from request URL, not file path
	collectionPath := resolvePath(path.Dir(r.URL.Path))

	var collection Collection
	if cached, hasOverride := s.overrides[collectionPath]; hasOverride {
		var ok bool
		collection, ok = cached.(Collection)
		if !ok {
			http.Error(w, "Corrupt overridden JSON", http.StatusInternalServerError)
			return
		}
	} else {
		data, err := dataFS.ReadFile(collectionPath)
		if err != nil {
			http.NotFound(w, r)
			return
		}
		if err := json.Unmarshal(data, &collection); err != nil {
			http.Error(w, "Corrupt embedded JSON", http.StatusInternalServerError)
			return
		}
	}

	// Rebuild the member list without the deleted resource. A fresh, non-nil
	// slice is used so the stored collection never aliases the previous one
	// and an emptied collection still encodes as [] rather than null.
	newMembers := make([]Member, 0, len(collection.Members))
	for _, member := range collection.Members {
		if member.OdataID != r.URL.Path {
			newMembers = append(newMembers, member)
		}
	}
	s.log.Info("Removing member from collection", "members", newMembers, "collection", collectionPath)
	collection.Members = newMembers
	s.overrides[collectionPath] = collection
	w.WriteHeader(http.StatusNoContent)
}

Expand Down
87 changes: 87 additions & 0 deletions bmc/redfish.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"io"
"maps"
"math/big"
"net/url"
"slices"
"strings"
"time"
Expand Down Expand Up @@ -975,3 +976,89 @@ func shuffleRunes(a []rune) error {
}
return nil
}

// subscriptionPayload is the JSON request body that CreateEventSubscription
// posts to the event service's subscriptions link. Every field is tagged
// omitempty, so fields left at their zero value (including empty slices and
// maps) are not sent.
type subscriptionPayload struct {
// Destination is the URL the events are delivered to.
Destination string `json:"Destination,omitempty"`
EventTypes []schemas.EventType `json:"EventTypes,omitempty"`
// EventFormatType selects the payload format of the subscription.
EventFormatType schemas.EventFormatType `json:"EventFormatType,omitempty"`
// RegistryPrefixes filters by the prefix of the event's MessageId.
RegistryPrefixes []string `json:"RegistryPrefixes,omitempty"`
// ResourceTypes filters by the schema name of the event's OriginOfCondition.
ResourceTypes []string `json:"ResourceTypes,omitempty"`
DeliveryRetryPolicy schemas.DeliveryRetryPolicy `json:"DeliveryRetryPolicy,omitempty"`
HTTPHeaders map[string]string `json:"HttpHeaders,omitempty"`
Oem any `json:"Oem,omitempty"`
Protocol schemas.EventDestinationProtocol `json:"Protocol,omitempty"`
// Context is a caller-chosen string stored with the subscription.
Context string `json:"Context,omitempty"`
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

func (r *RedfishBaseBMC) CreateEventSubscription(
ctx context.Context,
destination string,
eventFormatType schemas.EventFormatType,
deliveryRetryPolicy schemas.DeliveryRetryPolicy,
) (string, error) {
service := r.client.GetService()
ev, err := service.EventService()
if err != nil {
return "", fmt.Errorf("failed to get event service: %w", err)
}
if !ev.ServiceEnabled {
return "", fmt.Errorf("event service is not enabled")
}
payload := &subscriptionPayload{
Destination: destination,
EventFormatType: eventFormatType, // event or metricreport
Protocol: schemas.RedfishEventDestinationProtocol,
DeliveryRetryPolicy: deliveryRetryPolicy,
Context: "metal-operator",
}
client := ev.GetClient()
// some implementations (like Dell) do not support ResourceTypes and RegistryPrefixes
if len(ev.ResourceTypes) == 0 {
payload.EventTypes = []schemas.EventType{}
} else {
payload.RegistryPrefixes = []string{""} // Filters by the prefix of the event's MessageId, which points to a Message Registry: [Base, ResourceEvent, iLOEvents]
payload.ResourceTypes = []string{""} // Filters by the schema name (Resource Type) of the event's OriginOfCondition: [Chassis, ComputerSystem, Power]
}
Comment thread
stefanhipfel marked this conversation as resolved.
resp, err := client.Post(ev.SubscriptionsLink, payload)
if err != nil {
return "", err
}
defer func() {
_ = resp.Body.Close()
}()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return "", fmt.Errorf("failed to create event subscription status code: %d", resp.StatusCode)
}
// return subscription link from returned location
subscriptionLink := resp.Header.Get("Location")
if subscriptionLink == "" {
return "", fmt.Errorf("failed to get subscription link from response header")
}
urlParser, err := url.ParseRequestURI(subscriptionLink)
if err == nil {
subscriptionLink = urlParser.RequestURI()
}
return subscriptionLink, nil
}

// DeleteEventSubscription removes the event subscription identified by uri
// from the BMC's event service.
//
// The subscription is looked up first; when the lookup succeeds but yields no
// subscription the call is a no-op and returns nil. Errors are returned when
// the event service cannot be retrieved or is disabled, or when the lookup or
// the deletion fails. ctx is part of the BMC interface signature; it is not
// used by this implementation.
func (r *RedfishBaseBMC) DeleteEventSubscription(ctx context.Context, uri string) error {
	eventService, err := r.client.GetService().EventService()
	if err != nil {
		return fmt.Errorf("failed to get event service: %w", err)
	}
	if !eventService.ServiceEnabled {
		return fmt.Errorf("event service is not enabled")
	}

	subscription, err := eventService.GetEventSubscription(uri)
	switch {
	case err != nil:
		return fmt.Errorf("failed to get event subscription: %w", err)
	case subscription == nil:
		// Nothing to delete.
		return nil
	}

	if err = eventService.DeleteEventSubscription(uri); err != nil {
		return fmt.Errorf("failed to delete event subscription: %w", err)
	}
	return nil
}
40 changes: 40 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/ironcore-dev/controller-utils/conditionutils"
"github.com/ironcore-dev/metal-operator/internal/cmd/dns"
"github.com/ironcore-dev/metal-operator/internal/serverevents"
webhookv1alpha1 "github.com/ironcore-dev/metal-operator/internal/webhook/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/manager"

Expand Down Expand Up @@ -76,6 +77,9 @@ func main() { // nolint: gocyclo
registryPort int
registryProtocol string
registryURL string
eventPort int
eventURL string
eventProtocol string
registryResyncInterval time.Duration
webhookPort int
enforceFirstBoot bool
Expand Down Expand Up @@ -125,6 +129,10 @@ func main() { // nolint: gocyclo
flag.StringVar(&registryURL, "registry-url", "", "The URL of the registry.")
flag.StringVar(&registryProtocol, "registry-protocol", "http", "The protocol to use for the registry.")
flag.IntVar(&registryPort, "registry-port", 10000, "The port to use for the registry.")
flag.StringVar(&eventURL, "event-url", "", "The URL of the server events endpoint for alerts and metrics.")
flag.IntVar(&eventPort, "event-port", 10001, "The port to use for the server events endpoint for alerts and metrics.")
flag.StringVar(&eventProtocol, "event-protocol", "http",
"The protocol to use for the server events endpoint for alerts and metrics.")
flag.StringVar(&probeImage, "probe-image", "", "Image for the first boot probing of a Server.")
flag.StringVar(&probeOSImage, "probe-os-image", "", "OS image for the first boot probing of a Server.")
flag.StringVar(&managerNamespace, "manager-namespace", "default", "Namespace the manager is running in.")
Expand Down Expand Up @@ -210,6 +218,17 @@ func main() { // nolint: gocyclo
registryURL = fmt.Sprintf("%s://%s:%d", registryProtocol, registryAddr, registryPort)
}

// set the correct event URL by getting the address from the environment
var eventAddr string
if eventURL == "" {
eventAddr = os.Getenv("EVENT_ADDRESS")
if eventAddr == "" {
setupLog.Error(nil, "failed to set the event URL as no address is provided")
} else {
eventURL = fmt.Sprintf("%s://%s:%d", eventProtocol, eventAddr, eventPort)
}
}
Comment on lines +221 to +230
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Inconsistent logging level: use Info instead of Error for optional event URL.

Line 221 uses setupLog.Error(nil, ...) to report that the event URL is not configured. Since event functionality is intentionally optional (unlike the required registry URL which triggers os.Exit(1)), this should be an informational message to avoid confusing operators who don't use the event feature.

Suggested fix
 	var eventAddr string
 	if eventURL == "" {
 		eventAddr = os.Getenv("EVENT_ADDRESS")
 		if eventAddr == "" {
-			setupLog.Error(nil, "failed to set the event URL as no address is provided")
+			setupLog.Info("Event URL not configured, event subscriptions will be disabled")
 		} else {
 			eventURL = fmt.Sprintf("%s://%s:%d", eventProtocol, eventAddr, eventPort)
 		}
 	}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// set the correct event URL by getting the address from the environment
var eventAddr string
if eventURL == "" {
eventAddr = os.Getenv("EVENT_ADDRESS")
if eventAddr == "" {
setupLog.Error(nil, "failed to set the event URL as no address is provided")
} else {
eventURL = fmt.Sprintf("%s://%s:%d", eventProtocol, eventAddr, eventPort)
}
}
// set the correct event URL by getting the address from the environment
var eventAddr string
if eventURL == "" {
eventAddr = os.Getenv("EVENT_ADDRESS")
if eventAddr == "" {
setupLog.Info("Event URL not configured, event subscriptions will be disabled")
} else {
eventURL = fmt.Sprintf("%s://%s:%d", eventProtocol, eventAddr, eventPort)
}
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cmd/main.go` around lines 216 - 225, The log for missing optional event URL
should be informational, not an error: in the block that computes eventURL
(variables eventURL, eventAddr, eventProtocol, eventPort) replace
setupLog.Error(nil, "...") with setupLog.Info(...) and keep the existing
message/context and no process exit; ensure the branch that constructs eventURL
(fmt.Sprintf(...)) remains unchanged so optional behavior is preserved.


// if the enable-http2 flag is false (the default), http/2 should be disabled
// due to its vulnerabilities. More specifically, disabling http/2 will
// prevent from being vulnerable to the HTTP/2 Stream Cancelation and
Expand Down Expand Up @@ -355,6 +374,7 @@ func main() { // nolint: gocyclo
BMCResetWaitTime: bmcResetWaitingInterval,
BMCClientRetryInterval: bmcResetResyncInterval,
ManagerNamespace: managerNamespace,
EventURL: eventURL,
DNSRecordTemplate: dnsRecordTemplate,
Conditions: conditionutils.NewAccessor(conditionutils.AccessorOptions{}),
BMCOptions: bmc.Options{
Expand Down Expand Up @@ -615,6 +635,26 @@ func main() { // nolint: gocyclo
os.Exit(1)
}

if eventURL != "" {
if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
setupLog.Info("starting event server for alerts and metrics", "EventURL", eventURL)
eventServer := serverevents.NewServer(setupLog, fmt.Sprintf(":%d", eventPort))
eventServer.SetClient(mgr.GetClient())

criticalEventHandler := serverevents.CreateCriticalEventHandler(mgr.GetClient(), setupLog)
eventServer.SetCriticalEventHandler(criticalEventHandler)

if err := eventServer.Start(ctx); err != nil {
return fmt.Errorf("unable to start event server: %w", err)
}
<-ctx.Done()
return nil
})); err != nil {
setupLog.Error(err, "unable to add event runnable to manager")
os.Exit(1)
}
}

setupLog.Info("Starting manager")
if err := mgr.Start(ctx); err != nil {
setupLog.Error(err, "Failed to run manager")
Expand Down
Loading
Loading