From 3cd93c4b7d68e7488d14c43016f306f7c5c5a521 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 14:09:41 +1100 Subject: [PATCH 01/20] add poolworker database type, to genericise what the Monitor does --- pkg/api/poolworker.go | 19 + pkg/api/poolworkerdocument.go | 38 ++ pkg/database/cosmosdb/generate.go | 2 +- .../zz_generated_poolworkerdocument.go | 313 ++++++++++++++ .../zz_generated_poolworkerdocument_fake.go | 389 ++++++++++++++++++ pkg/database/database.go | 1 + pkg/database/dbgroup.go | 19 + pkg/database/poolworkers.go | 182 ++++++++ test/database/inmemory.go | 11 + test/database/poolworkers.go | 82 ++++ 10 files changed, 1055 insertions(+), 1 deletion(-) create mode 100644 pkg/api/poolworker.go create mode 100644 pkg/api/poolworkerdocument.go create mode 100644 pkg/database/cosmosdb/zz_generated_poolworkerdocument.go create mode 100644 pkg/database/cosmosdb/zz_generated_poolworkerdocument_fake.go create mode 100644 pkg/database/poolworkers.go create mode 100644 test/database/poolworkers.go diff --git a/pkg/api/poolworker.go b/pkg/api/poolworker.go new file mode 100644 index 00000000000..57105f219b2 --- /dev/null +++ b/pkg/api/poolworker.go @@ -0,0 +1,19 @@ +package api + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +type PoolWorkerType string + +const ( + PoolWorkerTypeMIMOActuator PoolWorkerType = "mimo-actuator" + PoolWorkerTypeMIMOScheduler PoolWorkerType = "mimo-scheduler" +) + +// PoolWorker represents a worker in a pool that distributes work via owning +// different OpenShiftCluster buckets +type PoolWorker struct { + MissingFields + + Buckets []string `json:"buckets,omitempty"` +} diff --git a/pkg/api/poolworkerdocument.go b/pkg/api/poolworkerdocument.go new file mode 100644 index 00000000000..6f8031ac9bd --- /dev/null +++ b/pkg/api/poolworkerdocument.go @@ -0,0 +1,38 @@ +package api + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. 
+ +// PoolWorkerDocuments represents PoolWorker documents. +// pkg/database/cosmosdb requires its definition. +type PoolWorkerDocuments struct { + Count int `json:"_count,omitempty"` + ResourceID string `json:"_rid,omitempty"` + PoolWorkerDocuments []*PoolWorkerDocument `json:"Documents,omitempty"` +} + +// PoolWorkerDocument represents a PoolWorker document. +// pkg/database/cosmosdb requires its definition. +type PoolWorkerDocument struct { + MissingFields + + ID string `json:"id,omitempty"` + ResourceID string `json:"_rid,omitempty"` + Timestamp int `json:"_ts,omitempty"` + Self string `json:"_self,omitempty"` + ETag string `json:"_etag,omitempty" deep:"-"` + Attachments string `json:"_attachments,omitempty"` + TTL int `json:"ttl,omitempty"` + LSN int `json:"_lsn,omitempty"` + Metadata map[string]interface{} `json:"_metadata,omitempty"` + + LeaseOwner string `json:"leaseOwner,omitempty"` + LeaseExpires int `json:"leaseExpires,omitempty"` + + WorkerType PoolWorkerType `json:"workerType,omitempty"` + PoolWorker *PoolWorker `json:"poolWorker,omitempty"` +} + +func (c *PoolWorkerDocument) GetID() string { + return c.ID +} diff --git a/pkg/database/cosmosdb/generate.go b/pkg/database/cosmosdb/generate.go index c55569a54a8..ae95676e63b 100644 --- a/pkg/database/cosmosdb/generate.go +++ b/pkg/database/cosmosdb/generate.go @@ -3,5 +3,5 @@ package cosmosdb // Copyright (c) Microsoft Corporation. // Licensed under the Apache License 2.0. 
-//go:generate gencosmosdb github.com/Azure/ARO-RP/pkg/api,AsyncOperationDocument github.com/Azure/ARO-RP/pkg/api,BillingDocument github.com/Azure/ARO-RP/pkg/api,GatewayDocument github.com/Azure/ARO-RP/pkg/api,MonitorDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftClusterDocument github.com/Azure/ARO-RP/pkg/api,SubscriptionDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftVersionDocument github.com/Azure/ARO-RP/pkg/api,PlatformWorkloadIdentityRoleSetDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceManifestDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceScheduleDocument +//go:generate gencosmosdb github.com/Azure/ARO-RP/pkg/api,AsyncOperationDocument github.com/Azure/ARO-RP/pkg/api,BillingDocument github.com/Azure/ARO-RP/pkg/api,GatewayDocument github.com/Azure/ARO-RP/pkg/api,MonitorDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftClusterDocument github.com/Azure/ARO-RP/pkg/api,SubscriptionDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftVersionDocument github.com/Azure/ARO-RP/pkg/api,PlatformWorkloadIdentityRoleSetDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceManifestDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceScheduleDocument github.com/Azure/ARO-RP/pkg/api,PoolWorkerDocument //go:generate mockgen -destination=../../util/mocks/$GOPACKAGE/$GOPACKAGE.go github.com/Azure/ARO-RP/pkg/database/$GOPACKAGE PermissionClient diff --git a/pkg/database/cosmosdb/zz_generated_poolworkerdocument.go b/pkg/database/cosmosdb/zz_generated_poolworkerdocument.go new file mode 100644 index 00000000000..26e5dc965ce --- /dev/null +++ b/pkg/database/cosmosdb/zz_generated_poolworkerdocument.go @@ -0,0 +1,313 @@ +// Code generated by github.com/bennerv/go-cosmosdb, DO NOT EDIT. 
+ +package cosmosdb + +import ( + "context" + "net/http" + "strconv" + "strings" + + pkg "github.com/Azure/ARO-RP/pkg/api" +) + +type poolWorkerDocumentClient struct { + *databaseClient + path string +} + +// PoolWorkerDocumentClient is a poolWorkerDocument client +type PoolWorkerDocumentClient interface { + Create(context.Context, string, *pkg.PoolWorkerDocument, *Options) (*pkg.PoolWorkerDocument, error) + List(*Options) PoolWorkerDocumentIterator + ListAll(context.Context, *Options) (*pkg.PoolWorkerDocuments, error) + Get(context.Context, string, string, *Options) (*pkg.PoolWorkerDocument, error) + Replace(context.Context, string, *pkg.PoolWorkerDocument, *Options) (*pkg.PoolWorkerDocument, error) + Delete(context.Context, string, *pkg.PoolWorkerDocument, *Options) error + Query(string, *Query, *Options) PoolWorkerDocumentRawIterator + QueryAll(context.Context, string, *Query, *Options) (*pkg.PoolWorkerDocuments, error) + ChangeFeed(*Options) PoolWorkerDocumentIterator +} + +type poolWorkerDocumentChangeFeedIterator struct { + *poolWorkerDocumentClient + continuation string + options *Options +} + +type poolWorkerDocumentListIterator struct { + *poolWorkerDocumentClient + continuation string + done bool + options *Options +} + +type poolWorkerDocumentQueryIterator struct { + *poolWorkerDocumentClient + partitionkey string + query *Query + continuation string + done bool + options *Options +} + +// PoolWorkerDocumentIterator is a poolWorkerDocument iterator +type PoolWorkerDocumentIterator interface { + Next(context.Context, int) (*pkg.PoolWorkerDocuments, error) + Continuation() string +} + +// PoolWorkerDocumentRawIterator is a poolWorkerDocument raw iterator +type PoolWorkerDocumentRawIterator interface { + PoolWorkerDocumentIterator + NextRaw(context.Context, int, interface{}) error +} + +// NewPoolWorkerDocumentClient returns a new poolWorkerDocument client +func NewPoolWorkerDocumentClient(collc CollectionClient, collid string) PoolWorkerDocumentClient { + 
return &poolWorkerDocumentClient{ + databaseClient: collc.(*collectionClient).databaseClient, + path: collc.(*collectionClient).path + "/colls/" + collid, + } +} + +func (c *poolWorkerDocumentClient) all(ctx context.Context, i PoolWorkerDocumentIterator) (*pkg.PoolWorkerDocuments, error) { + allpoolWorkerDocuments := &pkg.PoolWorkerDocuments{} + + for { + poolWorkerDocuments, err := i.Next(ctx, -1) + if err != nil { + return nil, err + } + if poolWorkerDocuments == nil { + break + } + + allpoolWorkerDocuments.Count += poolWorkerDocuments.Count + allpoolWorkerDocuments.ResourceID = poolWorkerDocuments.ResourceID + allpoolWorkerDocuments.PoolWorkerDocuments = append(allpoolWorkerDocuments.PoolWorkerDocuments, poolWorkerDocuments.PoolWorkerDocuments...) + } + + return allpoolWorkerDocuments, nil +} + +func (c *poolWorkerDocumentClient) Create(ctx context.Context, partitionkey string, newpoolWorkerDocument *pkg.PoolWorkerDocument, options *Options) (poolWorkerDocument *pkg.PoolWorkerDocument, err error) { + headers := http.Header{} + headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) + + if options == nil { + options = &Options{} + } + options.NoETag = true + + err = c.setOptions(options, newpoolWorkerDocument, headers) + if err != nil { + return + } + + err = c.do(ctx, http.MethodPost, c.path+"/docs", "docs", c.path, http.StatusCreated, &newpoolWorkerDocument, &poolWorkerDocument, headers) + return +} + +func (c *poolWorkerDocumentClient) List(options *Options) PoolWorkerDocumentIterator { + continuation := "" + if options != nil { + continuation = options.Continuation + } + + return &poolWorkerDocumentListIterator{poolWorkerDocumentClient: c, options: options, continuation: continuation} +} + +func (c *poolWorkerDocumentClient) ListAll(ctx context.Context, options *Options) (*pkg.PoolWorkerDocuments, error) { + return c.all(ctx, c.List(options)) +} + +func (c *poolWorkerDocumentClient) Get(ctx context.Context, partitionkey, poolWorkerDocumentid 
string, options *Options) (poolWorkerDocument *pkg.PoolWorkerDocument, err error) { + headers := http.Header{} + headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) + + err = c.setOptions(options, nil, headers) + if err != nil { + return + } + + err = c.do(ctx, http.MethodGet, c.path+"/docs/"+poolWorkerDocumentid, "docs", c.path+"/docs/"+poolWorkerDocumentid, http.StatusOK, nil, &poolWorkerDocument, headers) + return +} + +func (c *poolWorkerDocumentClient) Replace(ctx context.Context, partitionkey string, newpoolWorkerDocument *pkg.PoolWorkerDocument, options *Options) (poolWorkerDocument *pkg.PoolWorkerDocument, err error) { + headers := http.Header{} + headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) + + err = c.setOptions(options, newpoolWorkerDocument, headers) + if err != nil { + return + } + + err = c.do(ctx, http.MethodPut, c.path+"/docs/"+newpoolWorkerDocument.ID, "docs", c.path+"/docs/"+newpoolWorkerDocument.ID, http.StatusOK, &newpoolWorkerDocument, &poolWorkerDocument, headers) + return +} + +func (c *poolWorkerDocumentClient) Delete(ctx context.Context, partitionkey string, poolWorkerDocument *pkg.PoolWorkerDocument, options *Options) (err error) { + headers := http.Header{} + headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) + + err = c.setOptions(options, poolWorkerDocument, headers) + if err != nil { + return + } + + err = c.do(ctx, http.MethodDelete, c.path+"/docs/"+poolWorkerDocument.ID, "docs", c.path+"/docs/"+poolWorkerDocument.ID, http.StatusNoContent, nil, nil, headers) + return +} + +func (c *poolWorkerDocumentClient) Query(partitionkey string, query *Query, options *Options) PoolWorkerDocumentRawIterator { + continuation := "" + if options != nil { + continuation = options.Continuation + } + + return &poolWorkerDocumentQueryIterator{poolWorkerDocumentClient: c, partitionkey: partitionkey, query: query, options: options, continuation: continuation} +} + +func (c *poolWorkerDocumentClient) 
QueryAll(ctx context.Context, partitionkey string, query *Query, options *Options) (*pkg.PoolWorkerDocuments, error) { + return c.all(ctx, c.Query(partitionkey, query, options)) +} + +func (c *poolWorkerDocumentClient) ChangeFeed(options *Options) PoolWorkerDocumentIterator { + continuation := "" + if options != nil { + continuation = options.Continuation + } + + return &poolWorkerDocumentChangeFeedIterator{poolWorkerDocumentClient: c, options: options, continuation: continuation} +} + +func (c *poolWorkerDocumentClient) setOptions(options *Options, poolWorkerDocument *pkg.PoolWorkerDocument, headers http.Header) error { + if options == nil { + return nil + } + + if poolWorkerDocument != nil && !options.NoETag { + if poolWorkerDocument.ETag == "" { + return ErrETagRequired + } + headers.Set("If-Match", poolWorkerDocument.ETag) + } + if len(options.PreTriggers) > 0 { + headers.Set("X-Ms-Documentdb-Pre-Trigger-Include", strings.Join(options.PreTriggers, ",")) + } + if len(options.PostTriggers) > 0 { + headers.Set("X-Ms-Documentdb-Post-Trigger-Include", strings.Join(options.PostTriggers, ",")) + } + if len(options.PartitionKeyRangeID) > 0 { + headers.Set("X-Ms-Documentdb-PartitionKeyRangeID", options.PartitionKeyRangeID) + } + + return nil +} + +func (i *poolWorkerDocumentChangeFeedIterator) Next(ctx context.Context, maxItemCount int) (poolWorkerDocuments *pkg.PoolWorkerDocuments, err error) { + headers := http.Header{} + headers.Set("A-IM", "Incremental feed") + + headers.Set("X-Ms-Max-Item-Count", strconv.Itoa(maxItemCount)) + if i.continuation != "" { + headers.Set("If-None-Match", i.continuation) + } + + err = i.setOptions(i.options, nil, headers) + if err != nil { + return + } + + err = i.do(ctx, http.MethodGet, i.path+"/docs", "docs", i.path, http.StatusOK, nil, &poolWorkerDocuments, headers) + if IsErrorStatusCode(err, http.StatusNotModified) { + err = nil + } + if err != nil { + return + } + + i.continuation = headers.Get("Etag") + + return +} + +func (i 
*poolWorkerDocumentChangeFeedIterator) Continuation() string { + return i.continuation +} + +func (i *poolWorkerDocumentListIterator) Next(ctx context.Context, maxItemCount int) (poolWorkerDocuments *pkg.PoolWorkerDocuments, err error) { + if i.done { + return + } + + headers := http.Header{} + headers.Set("X-Ms-Max-Item-Count", strconv.Itoa(maxItemCount)) + if i.continuation != "" { + headers.Set("X-Ms-Continuation", i.continuation) + } + + err = i.setOptions(i.options, nil, headers) + if err != nil { + return + } + + err = i.do(ctx, http.MethodGet, i.path+"/docs", "docs", i.path, http.StatusOK, nil, &poolWorkerDocuments, headers) + if err != nil { + return + } + + i.continuation = headers.Get("X-Ms-Continuation") + i.done = i.continuation == "" + + return +} + +func (i *poolWorkerDocumentListIterator) Continuation() string { + return i.continuation +} + +func (i *poolWorkerDocumentQueryIterator) Next(ctx context.Context, maxItemCount int) (poolWorkerDocuments *pkg.PoolWorkerDocuments, err error) { + err = i.NextRaw(ctx, maxItemCount, &poolWorkerDocuments) + return +} + +func (i *poolWorkerDocumentQueryIterator) NextRaw(ctx context.Context, maxItemCount int, raw interface{}) (err error) { + if i.done { + return + } + + headers := http.Header{} + headers.Set("X-Ms-Max-Item-Count", strconv.Itoa(maxItemCount)) + headers.Set("X-Ms-Documentdb-Isquery", "True") + headers.Set("Content-Type", "application/query+json") + if i.partitionkey != "" { + headers.Set("X-Ms-Documentdb-Partitionkey", `["`+i.partitionkey+`"]`) + } else { + headers.Set("X-Ms-Documentdb-Query-Enablecrosspartition", "True") + } + if i.continuation != "" { + headers.Set("X-Ms-Continuation", i.continuation) + } + + err = i.setOptions(i.options, nil, headers) + if err != nil { + return + } + + err = i.do(ctx, http.MethodPost, i.path+"/docs", "docs", i.path, http.StatusOK, &i.query, &raw, headers) + if err != nil { + return + } + + i.continuation = headers.Get("X-Ms-Continuation") + i.done = i.continuation 
== "" + + return +} + +func (i *poolWorkerDocumentQueryIterator) Continuation() string { + return i.continuation +} diff --git a/pkg/database/cosmosdb/zz_generated_poolworkerdocument_fake.go b/pkg/database/cosmosdb/zz_generated_poolworkerdocument_fake.go new file mode 100644 index 00000000000..f4cbb04030f --- /dev/null +++ b/pkg/database/cosmosdb/zz_generated_poolworkerdocument_fake.go @@ -0,0 +1,389 @@ +// Code generated by github.com/bennerv/go-cosmosdb, DO NOT EDIT. + +package cosmosdb + +import ( + "context" + "fmt" + "net/http" + "sync" + + "github.com/ugorji/go/codec" + + pkg "github.com/Azure/ARO-RP/pkg/api" +) + +type ( + fakePoolWorkerDocumentTriggerHandler func(context.Context, *pkg.PoolWorkerDocument) error + fakePoolWorkerDocumentQueryHandler func(PoolWorkerDocumentClient, *Query, *Options) PoolWorkerDocumentRawIterator +) + +var _ PoolWorkerDocumentClient = &FakePoolWorkerDocumentClient{} + +// NewFakePoolWorkerDocumentClient returns a FakePoolWorkerDocumentClient +func NewFakePoolWorkerDocumentClient(h *codec.JsonHandle) *FakePoolWorkerDocumentClient { + return &FakePoolWorkerDocumentClient{ + jsonHandle: h, + poolWorkerDocuments: make(map[string]*pkg.PoolWorkerDocument), + triggerHandlers: make(map[string]fakePoolWorkerDocumentTriggerHandler), + queryHandlers: make(map[string]fakePoolWorkerDocumentQueryHandler), + } +} + +// FakePoolWorkerDocumentClient is a FakePoolWorkerDocumentClient +type FakePoolWorkerDocumentClient struct { + lock sync.RWMutex + jsonHandle *codec.JsonHandle + poolWorkerDocuments map[string]*pkg.PoolWorkerDocument + triggerHandlers map[string]fakePoolWorkerDocumentTriggerHandler + queryHandlers map[string]fakePoolWorkerDocumentQueryHandler + sorter func([]*pkg.PoolWorkerDocument) + etag int + changeFeedIterators []*fakePoolWorkerDocumentIterator + + // returns true if documents conflict + conflictChecker func(*pkg.PoolWorkerDocument, *pkg.PoolWorkerDocument) bool + + // err, if not nil, is an error to return when attempting to 
communicate + // with this Client + err error +} + +// SetError sets or unsets an error that will be returned on any +// FakePoolWorkerDocumentClient method invocation +func (c *FakePoolWorkerDocumentClient) SetError(err error) { + c.lock.Lock() + defer c.lock.Unlock() + + c.err = err +} + +// SetSorter sets or unsets a sorter function which will be used to sort values +// returned by List() for test stability +func (c *FakePoolWorkerDocumentClient) SetSorter(sorter func([]*pkg.PoolWorkerDocument)) { + c.lock.Lock() + defer c.lock.Unlock() + + c.sorter = sorter +} + +// SetConflictChecker sets or unsets a function which can be used to validate +// additional unique keys in a PoolWorkerDocument +func (c *FakePoolWorkerDocumentClient) SetConflictChecker(conflictChecker func(*pkg.PoolWorkerDocument, *pkg.PoolWorkerDocument) bool) { + c.lock.Lock() + defer c.lock.Unlock() + + c.conflictChecker = conflictChecker +} + +// SetTriggerHandler sets or unsets a trigger handler +func (c *FakePoolWorkerDocumentClient) SetTriggerHandler(triggerName string, trigger fakePoolWorkerDocumentTriggerHandler) { + c.lock.Lock() + defer c.lock.Unlock() + + c.triggerHandlers[triggerName] = trigger +} + +// SetQueryHandler sets or unsets a query handler +func (c *FakePoolWorkerDocumentClient) SetQueryHandler(queryName string, query fakePoolWorkerDocumentQueryHandler) { + c.lock.Lock() + defer c.lock.Unlock() + + c.queryHandlers[queryName] = query +} + +func (c *FakePoolWorkerDocumentClient) deepCopy(poolWorkerDocument *pkg.PoolWorkerDocument) (*pkg.PoolWorkerDocument, error) { + var b []byte + err := codec.NewEncoderBytes(&b, c.jsonHandle).Encode(poolWorkerDocument) + if err != nil { + return nil, err + } + + poolWorkerDocument = nil + err = codec.NewDecoderBytes(b, c.jsonHandle).Decode(&poolWorkerDocument) + if err != nil { + return nil, err + } + + return poolWorkerDocument, nil +} + +func (c *FakePoolWorkerDocumentClient) apply(ctx context.Context, partitionkey string, poolWorkerDocument 
*pkg.PoolWorkerDocument, options *Options, isCreate bool) (*pkg.PoolWorkerDocument, error) { + c.lock.Lock() + defer c.lock.Unlock() + + if c.err != nil { + return nil, c.err + } + + poolWorkerDocument, err := c.deepCopy(poolWorkerDocument) // copy now because pretriggers can mutate poolWorkerDocument + if err != nil { + return nil, err + } + + if options != nil { + err := c.processPreTriggers(ctx, poolWorkerDocument, options) + if err != nil { + return nil, err + } + } + + existingPoolWorkerDocument, exists := c.poolWorkerDocuments[poolWorkerDocument.ID] + if isCreate && exists { + return nil, &Error{ + StatusCode: http.StatusConflict, + Message: "Entity with the specified id already exists in the system", + } + } + if !isCreate { + if !exists { + return nil, &Error{StatusCode: http.StatusNotFound} + } + + if (options == nil || !options.NoETag) && poolWorkerDocument.ETag != existingPoolWorkerDocument.ETag { + return nil, &Error{StatusCode: http.StatusPreconditionFailed} + } + } + + if c.conflictChecker != nil { + for _, poolWorkerDocumentToCheck := range c.poolWorkerDocuments { + if c.conflictChecker(poolWorkerDocumentToCheck, poolWorkerDocument) { + return nil, &Error{ + StatusCode: http.StatusConflict, + Message: "Entity with the specified id already exists in the system", + } + } + } + } + + poolWorkerDocument.ETag = fmt.Sprint(c.etag) + c.etag++ + + c.poolWorkerDocuments[poolWorkerDocument.ID] = poolWorkerDocument + + if err = c.updateChangeFeeds(poolWorkerDocument); err != nil { + return nil, err + } + + return c.deepCopy(poolWorkerDocument) +} + +// Create creates a PoolWorkerDocument in the database +func (c *FakePoolWorkerDocumentClient) Create(ctx context.Context, partitionkey string, poolWorkerDocument *pkg.PoolWorkerDocument, options *Options) (*pkg.PoolWorkerDocument, error) { + return c.apply(ctx, partitionkey, poolWorkerDocument, options, true) +} + +// Replace replaces a PoolWorkerDocument in the database +func (c *FakePoolWorkerDocumentClient) 
Replace(ctx context.Context, partitionkey string, poolWorkerDocument *pkg.PoolWorkerDocument, options *Options) (*pkg.PoolWorkerDocument, error) { + return c.apply(ctx, partitionkey, poolWorkerDocument, options, false) +} + +// List returns a PoolWorkerDocumentIterator to list all PoolWorkerDocuments in the database +func (c *FakePoolWorkerDocumentClient) List(*Options) PoolWorkerDocumentIterator { + c.lock.RLock() + defer c.lock.RUnlock() + + if c.err != nil { + return NewFakePoolWorkerDocumentErroringRawIterator(c.err) + } + + poolWorkerDocuments := make([]*pkg.PoolWorkerDocument, 0, len(c.poolWorkerDocuments)) + for _, poolWorkerDocument := range c.poolWorkerDocuments { + poolWorkerDocument, err := c.deepCopy(poolWorkerDocument) + if err != nil { + return NewFakePoolWorkerDocumentErroringRawIterator(err) + } + poolWorkerDocuments = append(poolWorkerDocuments, poolWorkerDocument) + } + + if c.sorter != nil { + c.sorter(poolWorkerDocuments) + } + + return NewFakePoolWorkerDocumentIterator(poolWorkerDocuments, 0) +} + +// ListAll lists all PoolWorkerDocuments in the database +func (c *FakePoolWorkerDocumentClient) ListAll(ctx context.Context, options *Options) (*pkg.PoolWorkerDocuments, error) { + iter := c.List(options) + return iter.Next(ctx, -1) +} + +// Get gets a PoolWorkerDocument from the database +func (c *FakePoolWorkerDocumentClient) Get(ctx context.Context, partitionkey string, id string, options *Options) (*pkg.PoolWorkerDocument, error) { + c.lock.RLock() + defer c.lock.RUnlock() + + if c.err != nil { + return nil, c.err + } + + poolWorkerDocument, exists := c.poolWorkerDocuments[id] + if !exists { + return nil, &Error{StatusCode: http.StatusNotFound} + } + + return c.deepCopy(poolWorkerDocument) +} + +// Delete deletes a PoolWorkerDocument from the database +func (c *FakePoolWorkerDocumentClient) Delete(ctx context.Context, partitionKey string, poolWorkerDocument *pkg.PoolWorkerDocument, options *Options) error { + c.lock.Lock() + defer 
c.lock.Unlock() + + if c.err != nil { + return c.err + } + + _, exists := c.poolWorkerDocuments[poolWorkerDocument.ID] + if !exists { + return &Error{StatusCode: http.StatusNotFound} + } + + delete(c.poolWorkerDocuments, poolWorkerDocument.ID) + return nil +} + +// ChangeFeed is a basic implementation of cosmosDB Changefeeds. Compared to the real changefeeds, its implementation is much more simplistic: +// - Deleting a PoolWorkerDocument does not remove it from the existing change feeds +// - when a PoolWorkerDocument is pushed into the changefeed, older versions that have not been retrieved won't be removed, meaning there's no guarantee that a poolWorkerDocument from the changefeed is actually the most recent version. +func (c *FakePoolWorkerDocumentClient) ChangeFeed(*Options) PoolWorkerDocumentIterator { + c.lock.RLock() + defer c.lock.RUnlock() + + if c.err != nil { + return NewFakePoolWorkerDocumentErroringRawIterator(c.err) + } + + newIter, ok := c.List(nil).(*fakePoolWorkerDocumentIterator) + if !ok { + return NewFakePoolWorkerDocumentErroringRawIterator(fmt.Errorf("internal error")) + } + + c.changeFeedIterators = append(c.changeFeedIterators, newIter) + return newIter +} + +func (c *FakePoolWorkerDocumentClient) updateChangeFeeds(poolWorkerDocument *pkg.PoolWorkerDocument) error { + for _, currentIterator := range c.changeFeedIterators { + newTpl, err := c.deepCopy(poolWorkerDocument) + if err != nil { + return err + } + + currentIterator.poolWorkerDocuments = append(currentIterator.poolWorkerDocuments, newTpl) + currentIterator.done = false + } + return nil +} + +func (c *FakePoolWorkerDocumentClient) processPreTriggers(ctx context.Context, poolWorkerDocument *pkg.PoolWorkerDocument, options *Options) error { + for _, triggerName := range options.PreTriggers { + if triggerHandler := c.triggerHandlers[triggerName]; triggerHandler != nil { + c.lock.Unlock() + err := triggerHandler(ctx, poolWorkerDocument) + c.lock.Lock() + if err != nil { + return err + } + 
} else { + return ErrNotImplemented + } + } + + return nil +} + +// Query calls a query handler to implement database querying +func (c *FakePoolWorkerDocumentClient) Query(name string, query *Query, options *Options) PoolWorkerDocumentRawIterator { + c.lock.RLock() + defer c.lock.RUnlock() + + if c.err != nil { + return NewFakePoolWorkerDocumentErroringRawIterator(c.err) + } + + if queryHandler := c.queryHandlers[query.Query]; queryHandler != nil { + c.lock.RUnlock() + i := queryHandler(c, query, options) + c.lock.RLock() + return i + } + + return NewFakePoolWorkerDocumentErroringRawIterator(ErrNotImplemented) +} + +// QueryAll calls a query handler to implement database querying +func (c *FakePoolWorkerDocumentClient) QueryAll(ctx context.Context, partitionkey string, query *Query, options *Options) (*pkg.PoolWorkerDocuments, error) { + iter := c.Query("", query, options) + return iter.Next(ctx, -1) +} + +func NewFakePoolWorkerDocumentIterator(poolWorkerDocuments []*pkg.PoolWorkerDocument, continuation int) PoolWorkerDocumentRawIterator { + return &fakePoolWorkerDocumentIterator{poolWorkerDocuments: poolWorkerDocuments, continuation: continuation} +} + +type fakePoolWorkerDocumentIterator struct { + poolWorkerDocuments []*pkg.PoolWorkerDocument + continuation int + done bool +} + +func (i *fakePoolWorkerDocumentIterator) NextRaw(ctx context.Context, maxItemCount int, out interface{}) error { + return ErrNotImplemented +} + +func (i *fakePoolWorkerDocumentIterator) Next(ctx context.Context, maxItemCount int) (*pkg.PoolWorkerDocuments, error) { + if i.done { + return nil, nil + } + + var poolWorkerDocuments []*pkg.PoolWorkerDocument + if maxItemCount == -1 { + poolWorkerDocuments = i.poolWorkerDocuments[i.continuation:] + i.continuation = len(i.poolWorkerDocuments) + i.done = true + } else { + max := i.continuation + maxItemCount + if max > len(i.poolWorkerDocuments) { + max = len(i.poolWorkerDocuments) + } + poolWorkerDocuments = 
i.poolWorkerDocuments[i.continuation:max] + i.continuation = max + i.done = i.Continuation() == "" + } + + return &pkg.PoolWorkerDocuments{ + PoolWorkerDocuments: poolWorkerDocuments, + Count: len(poolWorkerDocuments), + }, nil +} + +func (i *fakePoolWorkerDocumentIterator) Continuation() string { + if i.continuation >= len(i.poolWorkerDocuments) { + return "" + } + return fmt.Sprintf("%d", i.continuation) +} + +// NewFakePoolWorkerDocumentErroringRawIterator returns a PoolWorkerDocumentRawIterator which +// whose methods return the given error +func NewFakePoolWorkerDocumentErroringRawIterator(err error) PoolWorkerDocumentRawIterator { + return &fakePoolWorkerDocumentErroringRawIterator{err: err} +} + +type fakePoolWorkerDocumentErroringRawIterator struct { + err error +} + +func (i *fakePoolWorkerDocumentErroringRawIterator) Next(ctx context.Context, maxItemCount int) (*pkg.PoolWorkerDocuments, error) { + return nil, i.err +} + +func (i *fakePoolWorkerDocumentErroringRawIterator) NextRaw(context.Context, int, interface{}) error { + return i.err +} + +func (i *fakePoolWorkerDocumentErroringRawIterator) Continuation() string { + return "" +} diff --git a/pkg/database/database.go b/pkg/database/database.go index 27236817ef3..de18f0c9b35 100644 --- a/pkg/database/database.go +++ b/pkg/database/database.go @@ -37,6 +37,7 @@ const ( collSubscriptions = "Subscriptions" collMaintenanceManifests = "MaintenanceManifests" collMaintenanceSchedules = "MaintenanceSchedules" + collPoolWorkers = "PoolWorkers" ) type IDable interface { diff --git a/pkg/database/dbgroup.go b/pkg/database/dbgroup.go index 3c885d906be..9ed475635a5 100644 --- a/pkg/database/dbgroup.go +++ b/pkg/database/dbgroup.go @@ -45,6 +45,10 @@ type DatabaseGroupWithMaintenanceSchedules interface { MaintenanceSchedules() (MaintenanceSchedules, error) } +type DatabaseGroupWithPoolWorkers interface { + PoolWorkers() (PoolWorkers, error) +} + type DatabaseGroup interface { DatabaseGroupWithOpenShiftClusters 
DatabaseGroupWithSubscriptions @@ -56,6 +60,7 @@ type DatabaseGroup interface { DatabaseGroupWithPortal DatabaseGroupWithMaintenanceManifests DatabaseGroupWithMaintenanceSchedules + DatabaseGroupWithPoolWorkers WithOpenShiftClusters(db OpenShiftClusters) DatabaseGroup WithSubscriptions(db Subscriptions) DatabaseGroup @@ -67,6 +72,7 @@ type DatabaseGroup interface { WithPortal(db Portal) DatabaseGroup WithMaintenanceManifests(db MaintenanceManifests) DatabaseGroup WithMaintenanceSchedules(db MaintenanceSchedules) DatabaseGroup + WithPoolWorkers(db PoolWorkers) DatabaseGroup } type dbGroup struct { @@ -80,6 +86,7 @@ type dbGroup struct { portal Portal maintenanceManifests MaintenanceManifests maintenanceSchedules MaintenanceSchedules + poolWorkers PoolWorkers } func (d *dbGroup) OpenShiftClusters() (OpenShiftClusters, error) { @@ -202,6 +209,18 @@ func (d *dbGroup) WithMaintenanceSchedules(db MaintenanceSchedules) DatabaseGrou return d } +func (d *dbGroup) PoolWorkers() (PoolWorkers, error) { + if d.poolWorkers == nil { + return nil, errors.New("no PoolWorkers database client set") + } + return d.poolWorkers, nil +} + +func (d *dbGroup) WithPoolWorkers(db PoolWorkers) DatabaseGroup { + d.poolWorkers = db + return d +} + func NewDBGroup() DatabaseGroup { return &dbGroup{} } diff --git a/pkg/database/poolworkers.go b/pkg/database/poolworkers.go new file mode 100644 index 00000000000..f98eb22d5a2 --- /dev/null +++ b/pkg/database/poolworkers.go @@ -0,0 +1,182 @@ +package database + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-RP/pkg/api" + "github.com/Azure/ARO-RP/pkg/database/cosmosdb" + "github.com/Azure/ARO-RP/pkg/util/uuid" +) + +const ( + PoolWorkerGetMasterQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id = @workerType AND doc.workerType = @workerType AND (doc.leaseExpires ?? 
0) < GetCurrentTimestamp() / 1000` // selects the master document (id == worker type) whose lease has expired; @workerType is a bound parameter and must stay unquoted + PoolWorkerGetWorkersQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id != @workerType AND doc.workerType = @workerType` // selects every non-master document of the given worker type; workerType is a top-level document field +) + +type poolWorkers struct { + c cosmosdb.PoolWorkerDocumentClient + uuid string +} + +// PoolWorkers is the database interface for PoolWorkerDocuments +type PoolWorkers interface { + Create(context.Context, api.PoolWorkerType, *api.PoolWorkerDocument) (*api.PoolWorkerDocument, error) + PatchWithLease(context.Context, api.PoolWorkerType, string, func(*api.PoolWorkerDocument) error) (*api.PoolWorkerDocument, error) + TryLease(context.Context, api.PoolWorkerType) (*api.PoolWorkerDocument, error) + ListBuckets(context.Context, api.PoolWorkerType) ([]int, error) + ListPoolWorkers(context.Context, api.PoolWorkerType) (*api.PoolWorkerDocuments, error) + PoolWorkerHeartbeat(context.Context, api.PoolWorkerType, int) error +} + +// NewPoolWorkers returns a new PoolWorkers +func NewPoolWorkers(ctx context.Context, dbc cosmosdb.DatabaseClient, dbName string) (PoolWorkers, error) { + collc := cosmosdb.NewCollectionClient(dbc, dbName) + + return &poolWorkers{ + c: cosmosdb.NewPoolWorkerDocumentClient(collc, collPoolWorkers), + uuid: uuid.DefaultGenerator.Generate(), + }, nil +} + +func NewPoolWorkersWithProvidedClient(client cosmosdb.PoolWorkerDocumentClient, uuid string) PoolWorkers { + return &poolWorkers{ + c: client, + uuid: uuid, + } +} + +func (c *poolWorkers) Create(ctx context.Context, poolWorkerType api.PoolWorkerType, doc *api.PoolWorkerDocument) (*api.PoolWorkerDocument, error) { + if doc.ID != strings.ToLower(doc.ID) { + return nil, fmt.Errorf("id %q is not lower case", doc.ID) + } + + doc, err := c.c.Create(ctx, string(poolWorkerType), doc, nil) + + if err, ok := err.(*cosmosdb.Error); ok && err.StatusCode == http.StatusConflict { + err.StatusCode = http.StatusPreconditionFailed + } + + return doc, err +} + +func (c *poolWorkers) get(ctx context.Context, poolWorkerType 
api.PoolWorkerType, id string) (*api.PoolWorkerDocument, error) { + if id != strings.ToLower(id) { + return nil, fmt.Errorf("id %q is not lower case", id) + } + + return c.c.Get(ctx, string(poolWorkerType), id, nil) +} + +func (c *poolWorkers) patch(ctx context.Context, poolWorkerType api.PoolWorkerType, id string, f func(*api.PoolWorkerDocument) error, options *cosmosdb.Options) (*api.PoolWorkerDocument, error) { + var doc *api.PoolWorkerDocument + + err := cosmosdb.RetryOnPreconditionFailed(func() (err error) { + doc, err = c.get(ctx, poolWorkerType, id) + if err != nil { + return + } + + err = f(doc) + if err != nil { + return + } + + doc, err = c.update(ctx, poolWorkerType, doc, options) + return + }) + + return doc, err +} + +func (c *poolWorkers) PatchWithLease(ctx context.Context, poolWorkerType api.PoolWorkerType, id string, f func(*api.PoolWorkerDocument) error) (*api.PoolWorkerDocument, error) { + return c.patch(ctx, poolWorkerType, id, func(doc *api.PoolWorkerDocument) error { + if doc.LeaseOwner != c.uuid { + return fmt.Errorf("lost lease") + } + + return f(doc) + }, &cosmosdb.Options{PreTriggers: []string{"renewLease"}}) +} + +func (c *poolWorkers) update(ctx context.Context, poolWorkerType api.PoolWorkerType, doc *api.PoolWorkerDocument, options *cosmosdb.Options) (*api.PoolWorkerDocument, error) { + if doc.ID != strings.ToLower(doc.ID) { + return nil, fmt.Errorf("id %q is not lower case", doc.ID) + } + + return c.c.Replace(ctx, string(poolWorkerType), doc, options) +} + +func (c *poolWorkers) TryLease(ctx context.Context, workerType api.PoolWorkerType) (*api.PoolWorkerDocument, error) { + docs, err := c.c.QueryAll(ctx, string(workerType), &cosmosdb.Query{ + Query: PoolWorkerGetMasterQuery, + Parameters: []cosmosdb.Parameter{ + { + Name: "@workerType", + Value: string(workerType), + }, + }, + }, nil) + if err != nil { + return nil, err + } + if docs == nil { + return nil, nil + } + + for _, doc := range docs.PoolWorkerDocuments { + doc.LeaseOwner = 
c.uuid + doc, err = c.update(ctx, workerType, doc, &cosmosdb.Options{PreTriggers: []string{"renewLease"}}) + if cosmosdb.IsErrorStatusCode(err, http.StatusPreconditionFailed) { // someone else got there first + continue + } + return doc, err + } + + return nil, nil +} + +func (c *poolWorkers) ListBuckets(ctx context.Context, poolWorkerType api.PoolWorkerType) (buckets []int, err error) { + doc, err := c.get(ctx, poolWorkerType, string(poolWorkerType)) + if err != nil || doc == nil || doc.PoolWorker == nil { + return nil, err + } + + for i, poolworker := range doc.PoolWorker.Buckets { + if poolworker == c.uuid { + buckets = append(buckets, i) + } + } + + return buckets, nil +} + +func (c *poolWorkers) ListPoolWorkers(ctx context.Context, poolWorkerType api.PoolWorkerType) (*api.PoolWorkerDocuments, error) { + return c.c.QueryAll(ctx, string(poolWorkerType), &cosmosdb.Query{ + Query: PoolWorkerGetWorkersQuery, + Parameters: []cosmosdb.Parameter{ + { + Name: "@workerType", + Value: string(poolWorkerType), + }, + }, + }, nil) +} + +func (c *poolWorkers) PoolWorkerHeartbeat(ctx context.Context, poolWorkerType api.PoolWorkerType, ttl int) error { + doc := &api.PoolWorkerDocument{ + ID: c.uuid, + WorkerType: poolWorkerType, + TTL: ttl, + } + _, err := c.update(ctx, poolWorkerType, doc, &cosmosdb.Options{NoETag: true}) + if err != nil && cosmosdb.IsErrorStatusCode(err, http.StatusNotFound) { + _, err = c.Create(ctx, poolWorkerType, doc) + } + return err +} diff --git a/test/database/inmemory.go b/test/database/inmemory.go index d2d26e8eaac..91fc7577b74 100644 --- a/test/database/inmemory.go +++ b/test/database/inmemory.go @@ -51,6 +51,17 @@ func NewFakeMonitorWithExistingClient(client *cosmosdb.FakeMonitorDocumentClient return database.NewMonitorsWithProvidedClient(client, uuid.DefaultGenerator.Generate()) } +func NewFakePoolWorkers(now func() time.Time) (db database.PoolWorkers, client *cosmosdb.FakePoolWorkerDocumentClient) { + client = 
cosmosdb.NewFakePoolWorkerDocumentClient(jsonHandle) + injectPoolWorkers(client, now) + db = database.NewPoolWorkersWithProvidedClient(client, uuid.DefaultGenerator.Generate()) + return db, client +} + +func FakePoolWorkerWithExistingClient(client *cosmosdb.FakePoolWorkerDocumentClient) database.PoolWorkers { + return database.NewPoolWorkersWithProvidedClient(client, uuid.DefaultGenerator.Generate()) +} + func NewFakeBilling() (db database.Billing, client *cosmosdb.FakeBillingDocumentClient) { client = cosmosdb.NewFakeBillingDocumentClient(jsonHandle) injectBilling(client) diff --git a/test/database/poolworkers.go b/test/database/poolworkers.go new file mode 100644 index 00000000000..0d9a84a23e6 --- /dev/null +++ b/test/database/poolworkers.go @@ -0,0 +1,82 @@ +package database + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "slices" + "time" + + "github.com/Azure/ARO-RP/pkg/api" + "github.com/Azure/ARO-RP/pkg/database" + "github.com/Azure/ARO-RP/pkg/database/cosmosdb" +) + +func fakePoolWorkeringRenewLeaseTrigger(_ context.Context, doc *api.PoolWorkerDocument, now func() time.Time) error { + doc.LeaseExpires = int(now().Unix()) + 60 + return nil +} + +func fakePoolWorkerGetMasterQuery(client cosmosdb.PoolWorkerDocumentClient, q *cosmosdb.Query, opts *cosmosdb.Options, now func() time.Time) cosmosdb.PoolWorkerDocumentRawIterator { + input, err := client.ListAll(context.Background(), opts) + if err != nil { + // TODO: should this never happen? 
+ panic(err) + } + + out := []*api.PoolWorkerDocument{} + for _, r := range input.PoolWorkerDocuments { + if r.ID != q.Parameters[0].Value { + continue + } + if string(r.WorkerType) != q.Parameters[0].Value { + continue + } + if time.Unix(int64(r.LeaseExpires), 0).After(now()) { + continue + } + out = append(out, r) + } + + return cosmosdb.NewFakePoolWorkerDocumentIterator(out, 0) +} + +func fakePoolWorkerGetAllButMasterHandler(client cosmosdb.PoolWorkerDocumentClient, q *cosmosdb.Query, opts *cosmosdb.Options, now func() time.Time) cosmosdb.PoolWorkerDocumentRawIterator { + input, err := client.ListAll(context.Background(), opts) + if err != nil { + // TODO: should this never happen? + panic(err) + } + if input == nil { + return cosmosdb.NewFakePoolWorkerDocumentIterator(nil, 0) + } + + out := []*api.PoolWorkerDocument{} + for _, r := range input.PoolWorkerDocuments { + if r.ID == q.Parameters[0].Value { + continue + } + if string(r.WorkerType) != q.Parameters[0].Value { + continue + } + // XXX: This does not test for TTL -- we need to add saving a Timestamp to gocosmosdb + out = append(out, r) + } + return cosmosdb.NewFakePoolWorkerDocumentIterator(out, 0) +} + +func injectPoolWorkers(c *cosmosdb.FakePoolWorkerDocumentClient, now func() time.Time) { + c.SetQueryHandler(database.PoolWorkerGetMasterQuery, func(client cosmosdb.PoolWorkerDocumentClient, query *cosmosdb.Query, opts *cosmosdb.Options) cosmosdb.PoolWorkerDocumentRawIterator { + return fakePoolWorkerGetMasterQuery(client, query, opts, now) + }) + c.SetQueryHandler(database.PoolWorkerGetWorkersQuery, func(client cosmosdb.PoolWorkerDocumentClient, query *cosmosdb.Query, opts *cosmosdb.Options) cosmosdb.PoolWorkerDocumentRawIterator { + return fakePoolWorkerGetAllButMasterHandler(client, query, opts, now) + }) + c.SetTriggerHandler("renewLease", func(ctx context.Context, doc *api.PoolWorkerDocument) error { + return fakePoolWorkeringRenewLeaseTrigger(ctx, doc, now) + }) + c.SetSorter(func(in 
[]*api.PoolWorkerDocument) { + slices.SortFunc(in, func(a, b *api.PoolWorkerDocument) int { return CompareIDable(a, b) }) + }) +} From 453527012d7d3819b47af9add54949a64494f741 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 15:06:23 +1100 Subject: [PATCH 02/20] add poolworkers to the deploy --- pkg/deploy/assets/databases-development.json | 40 +++++++++++++++++++ pkg/deploy/assets/rp-production.json | 42 ++++++++++++++++++++ pkg/deploy/generator/resources_rp.go | 26 ++++++++++++ 3 files changed, 108 insertions(+) diff --git a/pkg/deploy/assets/databases-development.json b/pkg/deploy/assets/databases-development.json index 11a459672c7..8350bf2a355 100644 --- a/pkg/deploy/assets/databases-development.json +++ b/pkg/deploy/assets/databases-development.json @@ -157,6 +157,28 @@ }, "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" }, + { + "apiVersion": "2023-04-15", + "dependsOn": [ + "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), parameters('databaseName'))]" + ], + "location": "[resourceGroup().location]", + "name": "[concat(parameters('databaseAccountName'), '/', parameters('databaseName'), '/PoolWorkers')]", + "properties": { + "options": {}, + "resource": { + "defaultTtl": -1, + "id": "PoolWorkers", + "partitionKey": { + "kind": "Hash", + "paths": [ + "/workerType" + ] + } + } + }, + "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" + }, { "apiVersion": "2023-04-15", "dependsOn": [ @@ -392,6 +414,24 @@ }, "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" }, + { + "apiVersion": "2023-04-15", + "dependsOn": [ + "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), parameters('databaseName'))]", + "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers', parameters('databaseAccountName'), parameters('databaseName'), 'PoolWorkers')]" + ], + "location": 
"[resourceGroup().location]", + "name": "[concat(parameters('databaseAccountName'), '/', parameters('databaseName'), '/PoolWorkers/renewLease')]", + "properties": { + "resource": { + "body": "function trigger() {\n\t\t\t\tvar request = getContext().getRequest();\n\t\t\t\tvar body = request.getBody();\n\t\t\t\tvar date = new Date();\n\t\t\t\tbody[\"leaseExpires\"] = Math.floor(date.getTime() / 1000) + 60;\n\t\t\t\trequest.setBody(body);\n\t\t\t}", + "id": "renewLease", + "triggerOperation": "All", + "triggerType": "Pre" + } + }, + "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" + }, { "apiVersion": "2023-04-15", "dependsOn": [ diff --git a/pkg/deploy/assets/rp-production.json b/pkg/deploy/assets/rp-production.json index 4634a495124..b10d259ffa1 100644 --- a/pkg/deploy/assets/rp-production.json +++ b/pkg/deploy/assets/rp-production.json @@ -888,6 +888,29 @@ }, "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" }, + { + "apiVersion": "2023-04-15", + "dependsOn": [ + "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), 'ARO')]", + "[resourceId('Microsoft.DocumentDB/databaseAccounts', parameters('databaseAccountName'))]" + ], + "location": "[resourceGroup().location]", + "name": "[concat(parameters('databaseAccountName'), '/', 'ARO', '/PoolWorkers')]", + "properties": { + "options": {}, + "resource": { + "defaultTtl": -1, + "id": "PoolWorkers", + "partitionKey": { + "kind": "Hash", + "paths": [ + "/workerType" + ] + } + } + }, + "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" + }, { "apiVersion": "2023-04-15", "dependsOn": [ @@ -1140,6 +1163,25 @@ }, "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" }, + { + "apiVersion": "2023-04-15", + "dependsOn": [ + "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), 'ARO')]", + 
"[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers', parameters('databaseAccountName'), 'ARO', 'PoolWorkers')]", + "[resourceId('Microsoft.DocumentDB/databaseAccounts', parameters('databaseAccountName'))]" + ], + "location": "[resourceGroup().location]", + "name": "[concat(parameters('databaseAccountName'), '/', 'ARO', '/PoolWorkers/renewLease')]", + "properties": { + "resource": { + "body": "function trigger() {\n\t\t\t\tvar request = getContext().getRequest();\n\t\t\t\tvar body = request.getBody();\n\t\t\t\tvar date = new Date();\n\t\t\t\tbody[\"leaseExpires\"] = Math.floor(date.getTime() / 1000) + 60;\n\t\t\t\trequest.setBody(body);\n\t\t\t}", + "id": "renewLease", + "triggerOperation": "All", + "triggerType": "Pre" + } + }, + "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" + }, { "apiVersion": "2023-04-15", "dependsOn": [ diff --git a/pkg/deploy/generator/resources_rp.go b/pkg/deploy/generator/resources_rp.go index beda18e8a5d..7f7380c4085 100644 --- a/pkg/deploy/generator/resources_rp.go +++ b/pkg/deploy/generator/resources_rp.go @@ -1006,6 +1006,30 @@ func (g *generator) database(databaseName string, addDependsOn bool) []*arm.Reso "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), " + databaseName + ")]", }, }, + { + Resource: &sdkcosmos.SQLContainerCreateUpdateParameters{ + Properties: &sdkcosmos.SQLContainerCreateUpdateProperties{ + Resource: &sdkcosmos.SQLContainerResource{ + ID: pointerutils.ToPtr("PoolWorkers"), + PartitionKey: &sdkcosmos.ContainerPartitionKey{ + Paths: []*string{ + pointerutils.ToPtr("/workerType"), + }, + Kind: &hashPartitionKey, + }, + DefaultTTL: pointerutils.ToPtr(int32(-1)), + }, + Options: &sdkcosmos.CreateUpdateOptions{}, + }, + Name: pointerutils.ToPtr("[concat(parameters('databaseAccountName'), '/', " + databaseName + ", '/PoolWorkers')]"), + Type: 
pointerutils.ToPtr("Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers"), + Location: pointerutils.ToPtr("[resourceGroup().location]"), + }, + APIVersion: azureclient.APIVersion("Microsoft.DocumentDB"), + DependsOn: []string{ + "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), " + databaseName + ")]", + }, + }, { Resource: &sdkcosmos.SQLContainerCreateUpdateParameters{ Properties: &sdkcosmos.SQLContainerCreateUpdateProperties{ @@ -1088,6 +1112,8 @@ func (g *generator) database(databaseName string, addDependsOn bool) []*arm.Reso g.rpCosmosDBTriggers(databaseName, "OpenShiftClusters", "renewLease", renewLeaseTriggerFunction, sdkcosmos.TriggerTypePre, sdkcosmos.TriggerOperationAll), // Monitors g.rpCosmosDBTriggers(databaseName, "Monitors", "renewLease", renewLeaseTriggerFunction, sdkcosmos.TriggerTypePre, sdkcosmos.TriggerOperationAll), + // PoolWorkers + g.rpCosmosDBTriggers(databaseName, "PoolWorkers", "renewLease", renewLeaseTriggerFunction, sdkcosmos.TriggerTypePre, sdkcosmos.TriggerOperationAll), // MIMO DB triggers g.rpCosmosDBTriggers(databaseName, "MaintenanceManifests", "renewLease", renewLeaseTriggerFunction, sdkcosmos.TriggerTypePre, sdkcosmos.TriggerOperationAll), ) From 64c11ce817950d17b4395252627ecb59c5be5f6b Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 15:14:21 +1100 Subject: [PATCH 03/20] monitor: move the monitor to use PoolWorker --- pkg/api/poolworker.go | 1 + pkg/monitor/master.go | 36 +++++++++--------- pkg/monitor/master_test.go | 74 ++++++++++++++++++------------------- pkg/monitor/monitor.go | 15 ++++---- pkg/monitor/test_helpers.go | 60 +++++++++++++++--------------- pkg/monitor/worker.go | 4 +- test/database/inmemory.go | 2 +- 7 files changed, 97 insertions(+), 95 deletions(-) diff --git a/pkg/api/poolworker.go b/pkg/api/poolworker.go index 57105f219b2..b37c9508667 100644 --- a/pkg/api/poolworker.go +++ b/pkg/api/poolworker.go @@ -6,6 +6,7 @@ package api type 
PoolWorkerType string const ( + PoolWorkerTypeMonitor PoolWorkerType = "monitor" PoolWorkerTypeMIMOActuator PoolWorkerType = "mimo-actuator" PoolWorkerTypeMIMOScheduler PoolWorkerType = "mimo-scheduler" ) diff --git a/pkg/monitor/master.go b/pkg/monitor/master.go index f3934003045..e3b865a395b 100644 --- a/pkg/monitor/master.go +++ b/pkg/monitor/master.go @@ -12,7 +12,7 @@ import ( // master updates the monitor document with the list of buckets balanced between // registered monitors func (mon *monitor) master(ctx context.Context) error { - dbMonitors, err := mon.dbGroup.Monitors() + dbPoolWorkers, err := mon.dbGroup.PoolWorkers() if err != nil { return err } @@ -20,7 +20,7 @@ func (mon *monitor) master(ctx context.Context) error { // if we know we're not the master, attempt to gain the lease on the monitor // document if !mon.isMaster { - doc, err := dbMonitors.TryLease(ctx) + doc, err := dbPoolWorkers.TryLease(ctx, api.PoolWorkerTypeMonitor) if err != nil || doc == nil { return err } @@ -36,16 +36,16 @@ func (mon *monitor) master(ctx context.Context) error { // including ourself, balance buckets between them and write the bucket // allocations to the database. 
If it turns out that we're not the master, // the patch will fail - _, err = dbMonitors.PatchWithLease(ctx, "master", func(doc *api.MonitorDocument) error { - docs, err := dbMonitors.ListMonitors(ctx) + _, err = dbPoolWorkers.PatchWithLease(ctx, api.PoolWorkerTypeMonitor, string(api.PoolWorkerTypeMonitor), func(doc *api.PoolWorkerDocument) error { + docs, err := dbPoolWorkers.ListPoolWorkers(ctx, api.PoolWorkerTypeMonitor) if err != nil { return err } var monitors []string if docs != nil { - monitors = make([]string, 0, len(docs.MonitorDocuments)) - for _, doc := range docs.MonitorDocuments { + monitors = make([]string, 0, len(docs.PoolWorkerDocuments)) + for _, doc := range docs.PoolWorkerDocuments { monitors = append(monitors, doc.ID) } } @@ -61,19 +61,19 @@ func (mon *monitor) master(ctx context.Context) error { } // balance shares out buckets over a slice of registered monitors -func (mon *monitor) balance(monitors []string, doc *api.MonitorDocument) { - // initialise doc.Monitor - if doc.Monitor == nil { - doc.Monitor = &api.Monitor{} +func (mon *monitor) balance(monitors []string, doc *api.PoolWorkerDocument) { + // initialise doc.PoolWorker + if doc.PoolWorker == nil { + doc.PoolWorker = &api.PoolWorker{} } - // ensure len(doc.Monitor.Buckets) == mon.bucketCount: this should only do + // ensure len(doc.PoolWorker.Buckets) == mon.bucketCount: this should only do // anything on the very first run - if len(doc.Monitor.Buckets) < mon.bucketCount { - doc.Monitor.Buckets = append(doc.Monitor.Buckets, make([]string, mon.bucketCount-len(doc.Monitor.Buckets))...) + if len(doc.PoolWorker.Buckets) < mon.bucketCount { + doc.PoolWorker.Buckets = append(doc.PoolWorker.Buckets, make([]string, mon.bucketCount-len(doc.PoolWorker.Buckets))...) 
} - if len(doc.Monitor.Buckets) > mon.bucketCount { // should never happen - doc.Monitor.Buckets = doc.Monitor.Buckets[:mon.bucketCount] + if len(doc.PoolWorker.Buckets) > mon.bucketCount { // should never happen + doc.PoolWorker.Buckets = doc.PoolWorker.Buckets[:mon.bucketCount] } var unallocated []int @@ -91,7 +91,7 @@ func (mon *monitor) balance(monitors []string, doc *api.MonitorDocument) { } // load the current bucket allocations into the map - for i, monitor := range doc.Monitor.Buckets { + for i, monitor := range doc.PoolWorker.Buckets { if buckets, found := m[monitor]; found && len(buckets) < target { // if the current bucket is allocated to a known monitor and doesn't // take its number of buckets above the target, keep it there... @@ -119,11 +119,11 @@ func (mon *monitor) balance(monitors []string, doc *api.MonitorDocument) { // write the updated bucket allocations back to the document for _, i := range unallocated { - doc.Monitor.Buckets[i] = "" // should only happen if there are no known monitors + doc.PoolWorker.Buckets[i] = "" // should only happen if there are no known monitors } for monitor, buckets := range m { for _, i := range buckets { - doc.Monitor.Buckets[i] = monitor + doc.PoolWorker.Buckets[i] = monitor } } } diff --git a/pkg/monitor/master_test.go b/pkg/monitor/master_test.go index a5bc29406c8..74799e7d84e 100644 --- a/pkg/monitor/master_test.go +++ b/pkg/monitor/master_test.go @@ -14,19 +14,19 @@ func TestBalance(t *testing.T) { type test struct { name string monitors []string - doc func() *api.MonitorDocument - validate func(*testing.T, *test, *api.MonitorDocument) + doc func() *api.PoolWorkerDocument + validate func(*testing.T, *test, *api.PoolWorkerDocument) } for _, tt := range []*test{ { name: "0->1", monitors: []string{"one"}, - doc: func() *api.MonitorDocument { - return &api.MonitorDocument{} + doc: func() *api.PoolWorkerDocument { + return &api.PoolWorkerDocument{} }, - validate: func(t *testing.T, tt *test, doc 
*api.MonitorDocument) { - for i, bucket := range doc.Monitor.Buckets { + validate: func(t *testing.T, tt *test, doc *api.PoolWorkerDocument) { + for i, bucket := range doc.PoolWorker.Buckets { if bucket != "one" { t.Error(i, bucket) } @@ -36,15 +36,15 @@ func TestBalance(t *testing.T) { { name: "3->1", monitors: []string{"one"}, - doc: func() *api.MonitorDocument { - return &api.MonitorDocument{ - Monitor: &api.Monitor{ + doc: func() *api.PoolWorkerDocument { + return &api.PoolWorkerDocument{ + PoolWorker: &api.PoolWorker{ Buckets: []string{"one", "two", "one", "three", "one", "two", "two", "one", "two"}, }, } }, - validate: func(t *testing.T, tt *test, doc *api.MonitorDocument) { - for i, bucket := range doc.Monitor.Buckets { + validate: func(t *testing.T, tt *test, doc *api.PoolWorkerDocument) { + for i, bucket := range doc.PoolWorker.Buckets { if bucket != "one" { t.Error(i, bucket) } @@ -53,15 +53,15 @@ func TestBalance(t *testing.T) { }, { name: "3->0", - doc: func() *api.MonitorDocument { - return &api.MonitorDocument{ - Monitor: &api.Monitor{ + doc: func() *api.PoolWorkerDocument { + return &api.PoolWorkerDocument{ + PoolWorker: &api.PoolWorker{ Buckets: []string{"one", "one", "one", "one", "one", "one", "two", "three"}, }, } }, - validate: func(t *testing.T, tt *test, doc *api.MonitorDocument) { - for i, bucket := range doc.Monitor.Buckets { + validate: func(t *testing.T, tt *test, doc *api.PoolWorkerDocument) { + for i, bucket := range doc.PoolWorker.Buckets { if bucket != "" { t.Error(i, bucket) } @@ -70,23 +70,23 @@ func TestBalance(t *testing.T) { }, { name: "imbalanced", - doc: func() *api.MonitorDocument { - return &api.MonitorDocument{ - Monitor: &api.Monitor{ + doc: func() *api.PoolWorkerDocument { + return &api.PoolWorkerDocument{ + PoolWorker: &api.PoolWorker{ Buckets: []string{"one", "one", "", "two", "one", "one", "one", "one"}, }, } }, monitors: []string{"one", "two"}, - validate: func(t *testing.T, tt *test, doc *api.MonitorDocument) { + 
validate: func(t *testing.T, tt *test, doc *api.PoolWorkerDocument) { old := tt.doc() m := map[string]int{} - for i, bucket := range doc.Monitor.Buckets { + for i, bucket := range doc.PoolWorker.Buckets { m[bucket]++ switch bucket { case "one": - if old.Monitor.Buckets[i] != bucket { + if old.PoolWorker.Buckets[i] != bucket { t.Error(i) } case "two": @@ -108,41 +108,41 @@ func TestBalance(t *testing.T) { }, { name: "stable", - doc: func() *api.MonitorDocument { - return &api.MonitorDocument{ - Monitor: &api.Monitor{ + doc: func() *api.PoolWorkerDocument { + return &api.PoolWorkerDocument{ + PoolWorker: &api.PoolWorker{ Buckets: []string{"one", "two", "three", "one", "two", "three", "one", "three"}, }, } }, monitors: []string{"one", "two", "three"}, - validate: func(t *testing.T, tt *test, doc *api.MonitorDocument) { + validate: func(t *testing.T, tt *test, doc *api.PoolWorkerDocument) { old := tt.doc() if !reflect.DeepEqual(old, doc) { - t.Error(doc.Monitor.Buckets) + t.Error(doc.PoolWorker.Buckets) } }, }, { name: "3->5", - doc: func() *api.MonitorDocument { - return &api.MonitorDocument{ - Monitor: &api.Monitor{ + doc: func() *api.PoolWorkerDocument { + return &api.PoolWorkerDocument{ + PoolWorker: &api.PoolWorker{ Buckets: []string{"one", "two", "three", "one", "two", "three", "one", "three"}, }, } }, monitors: []string{"one", "two", "three", "four", "five"}, - validate: func(t *testing.T, tt *test, doc *api.MonitorDocument) { + validate: func(t *testing.T, tt *test, doc *api.PoolWorkerDocument) { old := tt.doc() m := map[string]int{} - for i, bucket := range doc.Monitor.Buckets { + for i, bucket := range doc.PoolWorker.Buckets { m[bucket]++ switch bucket { case "one", "two", "three": - if old.Monitor.Buckets[i] != bucket { + if old.PoolWorker.Buckets[i] != bucket { t.Error(i) } case "four", "five": @@ -172,12 +172,12 @@ func TestBalance(t *testing.T) { mon.balance(tt.monitors, doc) - if doc.Monitor == nil { - t.Fatal(doc.Monitor) + if doc.PoolWorker == nil { + 
t.Fatal(doc.PoolWorker) } - if len(doc.Monitor.Buckets) != 8 { - t.Fatal(len(doc.Monitor.Buckets)) + if len(doc.PoolWorker.Buckets) != 8 { + t.Fatal(len(doc.PoolWorker.Buckets)) } tt.validate(t, tt, doc) diff --git a/pkg/monitor/monitor.go b/pkg/monitor/monitor.go index 5f856f7e4ad..42cf83786ad 100644 --- a/pkg/monitor/monitor.go +++ b/pkg/monitor/monitor.go @@ -33,7 +33,7 @@ import ( ) type monitorDBs interface { - database.DatabaseGroupWithMonitors + database.DatabaseGroupWithPoolWorkers database.DatabaseGroupWithOpenShiftClusters database.DatabaseGroupWithSubscriptions } @@ -118,7 +118,7 @@ func NewMonitor(log *logrus.Entry, dialer proxy.Dialer, dbGroup monitorDBs, m, c } func (mon *monitor) Run(ctx context.Context) error { - dbMonitors, err := mon.dbGroup.Monitors() + dbPoolWorkers, err := mon.dbGroup.PoolWorkers() if err != nil { return err } @@ -139,11 +139,12 @@ func (mon *monitor) Run(ctx context.Context) error { // dequeue it. If it already exists we will get a StatusPreconditionFailed // error, which is expected and we can ignore. The leasing of the master // document is in `mon.master()`. 
- _, err = dbMonitors.Create(ctx, &api.MonitorDocument{ - ID: "master", + _, err = dbPoolWorkers.Create(ctx, api.PoolWorkerTypeMonitor, &api.PoolWorkerDocument{ + ID: string(api.PoolWorkerTypeMonitor), + WorkerType: api.PoolWorkerTypeMonitor, }) if err != nil && !cosmosdb.IsErrorStatusCode(err, http.StatusPreconditionFailed) { - mon.baseLog.Error(fmt.Errorf("error bootstrapping master MonitorDocument (not a 412): %w", err)) + mon.baseLog.Error(fmt.Errorf("error bootstrapping master PoolWorkerDocument (not a 412): %w", err)) return err } @@ -161,9 +162,9 @@ func (mon *monitor) Run(ctx context.Context) error { for { // register ourself as a monitor, ttl of 60s default - err = dbMonitors.MonitorHeartbeat(ctx, int(mon.changefeedInterval.Seconds()*6)) + err = dbPoolWorkers.PoolWorkerHeartbeat(ctx, api.PoolWorkerTypeMonitor, int(mon.changefeedInterval.Seconds()*6)) if err != nil { - mon.baseLog.Error(fmt.Errorf("error registering ourselves as a monitor, continuing: %w", err)) + mon.baseLog.Error(fmt.Errorf("error registering ourselves as a Monitor poolWorker, continuing: %w", err)) } // try to become master and share buckets across registered monitors diff --git a/pkg/monitor/test_helpers.go b/pkg/monitor/test_helpers.go index 34b2b365111..d514a326698 100644 --- a/pkg/monitor/test_helpers.go +++ b/pkg/monitor/test_helpers.go @@ -35,19 +35,19 @@ var fakeClusterVisitMonitoringAttempts = xsync.NewMap[string, *atomic.Int64]() // TestEnvironment contains all the test setup components type TestEnvironment struct { - OpenShiftClusterDB database.OpenShiftClusters - SubscriptionsDB database.Subscriptions - MonitorsDB database.Monitors - OpenShiftClusterClient *cosmosdb.FakeOpenShiftClusterDocumentClient - SubscriptionsClient *cosmosdb.FakeSubscriptionDocumentClient - FakeMonitorsDBClient *cosmosdb.FakeMonitorDocumentClient - Controller *gomock.Controller - TestLogger *logrus.Entry - Dialer *mock_proxy.MockDialer - MockEnv *mock_env.MockInterface - NoopMetricsEmitter noop.Noop - 
NoopClusterMetrics noop.Noop - DBGroup monitorDBs + OpenShiftClusterDB database.OpenShiftClusters + SubscriptionsDB database.Subscriptions + PoolWorkersDB database.PoolWorkers + OpenShiftClusterClient *cosmosdb.FakeOpenShiftClusterDocumentClient + SubscriptionsClient *cosmosdb.FakeSubscriptionDocumentClient + FakePoolWorkersDBClient *cosmosdb.FakePoolWorkerDocumentClient + Controller *gomock.Controller + TestLogger *logrus.Entry + Dialer *mock_proxy.MockDialer + MockEnv *mock_env.MockInterface + NoopMetricsEmitter noop.Noop + NoopClusterMetrics noop.Noop + DBGroup monitorDBs } // SetupTestEnvironment creates a common test environment for monitor tests @@ -55,7 +55,7 @@ func SetupTestEnvironment(t *testing.T) *TestEnvironment { // Create databases openShiftClusterDB, openShiftClusterClient := testdatabase.NewFakeOpenShiftClusters() subscriptionsDB, subscriptionsClient := testdatabase.NewFakeSubscriptions() - monitorsDB, fakeMonitorsDBClient := testdatabase.NewFakeMonitors(time.Now) + poolWorkersDB, fakePoolMonitorsDBClient := testdatabase.NewFakePoolWorkers(time.Now) // Create mocks ctrl := gomock.NewController(t) @@ -71,7 +71,7 @@ func SetupTestEnvironment(t *testing.T) *TestEnvironment { // Create database group dbs := database.NewDBGroup(). - WithMonitors(monitorsDB). + WithPoolWorkers(poolWorkersDB). WithOpenShiftClusters(openShiftClusterDB). 
WithSubscriptions(subscriptionsDB) @@ -80,26 +80,26 @@ func SetupTestEnvironment(t *testing.T) *TestEnvironment { f.Create() return &TestEnvironment{ - OpenShiftClusterDB: openShiftClusterDB, - SubscriptionsDB: subscriptionsDB, - MonitorsDB: monitorsDB, - OpenShiftClusterClient: openShiftClusterClient, - SubscriptionsClient: subscriptionsClient, - FakeMonitorsDBClient: fakeMonitorsDBClient, - Controller: ctrl, - TestLogger: testlogger, - Dialer: dialer, - MockEnv: mockEnv, - NoopMetricsEmitter: noopMetricsEmitter, - NoopClusterMetrics: noopClusterMetricsEmitter, - DBGroup: dbs, + OpenShiftClusterDB: openShiftClusterDB, + SubscriptionsDB: subscriptionsDB, + PoolWorkersDB: poolWorkersDB, + OpenShiftClusterClient: openShiftClusterClient, + SubscriptionsClient: subscriptionsClient, + FakePoolWorkersDBClient: fakePoolMonitorsDBClient, + Controller: ctrl, + TestLogger: testlogger, + Dialer: dialer, + MockEnv: mockEnv, + NoopMetricsEmitter: noopMetricsEmitter, + NoopClusterMetrics: noopClusterMetricsEmitter, + DBGroup: dbs, } } // CreateTestMonitor creates a single monitor with test configuration func (env *TestEnvironment) CreateTestMonitor(loggerField string) *monitor { - uniqueMonitorsDB := testdatabase.NewFakeMonitorWithExistingClient(env.FakeMonitorsDBClient) - nDBs := database.NewDBGroup().WithMonitors(uniqueMonitorsDB). + uniquePoolWorkersDB := testdatabase.NewFakePoolWorkersWithExistingClient(env.FakePoolWorkersDBClient) + nDBs := database.NewDBGroup().WithPoolWorkers(uniquePoolWorkersDB). WithOpenShiftClusters(env.OpenShiftClusterDB). 
WithSubscriptions(env.SubscriptionsDB) diff --git a/pkg/monitor/worker.go b/pkg/monitor/worker.go index 0c32ec497a6..13a57d050a1 100644 --- a/pkg/monitor/worker.go +++ b/pkg/monitor/worker.go @@ -37,12 +37,12 @@ const changefeedBatchSize = 50 // listBuckets reads our bucket allocation from the master func (mon *monitor) listBuckets(ctx context.Context) error { - dbMonitors, err := mon.dbGroup.Monitors() + dbPoolWorkers, err := mon.dbGroup.PoolWorkers() if err != nil { return err } - buckets, err := dbMonitors.ListBuckets(ctx) + buckets, err := dbPoolWorkers.ListBuckets(ctx, api.PoolWorkerTypeMonitor) if err != nil { return err } diff --git a/test/database/inmemory.go b/test/database/inmemory.go index 91fc7577b74..262e86cf166 100644 --- a/test/database/inmemory.go +++ b/test/database/inmemory.go @@ -58,7 +58,7 @@ func NewFakePoolWorkers(now func() time.Time) (db database.PoolWorkers, client * return db, client } -func FakePoolWorkerWithExistingClient(client *cosmosdb.FakePoolWorkerDocumentClient) database.PoolWorkers { +func NewFakePoolWorkersWithExistingClient(client *cosmosdb.FakePoolWorkerDocumentClient) database.PoolWorkers { return database.NewPoolWorkersWithProvidedClient(client, uuid.DefaultGenerator.Generate()) } From 28e9c0a67a5489ddf4a4ea043ffda8ed0dc2ef0a Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 15:46:37 +1100 Subject: [PATCH 04/20] move the monitor bucketing logic to pkg/util/buckets --- pkg/monitor/master.go | 129 ------------- pkg/monitor/monitor.go | 28 +-- pkg/monitor/worker.go | 20 +- pkg/util/buckets/balancer.go | 173 ++++++++++++++++++ .../buckets/balancer_test.go} | 8 +- 5 files changed, 182 insertions(+), 176 deletions(-) delete mode 100644 pkg/monitor/master.go create mode 100644 pkg/util/buckets/balancer.go rename pkg/{monitor/master_test.go => util/buckets/balancer_test.go} (97%) diff --git a/pkg/monitor/master.go b/pkg/monitor/master.go deleted file mode 100644 index e3b865a395b..00000000000 --- 
a/pkg/monitor/master.go +++ /dev/null @@ -1,129 +0,0 @@ -package monitor - -// Copyright (c) Microsoft Corporation. -// Licensed under the Apache License 2.0. - -import ( - "context" - - "github.com/Azure/ARO-RP/pkg/api" -) - -// master updates the monitor document with the list of buckets balanced between -// registered monitors -func (mon *monitor) master(ctx context.Context) error { - dbPoolWorkers, err := mon.dbGroup.PoolWorkers() - if err != nil { - return err - } - - // if we know we're not the master, attempt to gain the lease on the monitor - // document - if !mon.isMaster { - doc, err := dbPoolWorkers.TryLease(ctx, api.PoolWorkerTypeMonitor) - if err != nil || doc == nil { - return err - } - mon.isMaster = true - } - - // we know we're not the master; give up - if !mon.isMaster { - return nil - } - - // we think we're the master. Gather up all the registered monitors - // including ourself, balance buckets between them and write the bucket - // allocations to the database. If it turns out that we're not the master, - // the patch will fail - _, err = dbPoolWorkers.PatchWithLease(ctx, api.PoolWorkerTypeMonitor, string(api.PoolWorkerTypeMonitor), func(doc *api.PoolWorkerDocument) error { - docs, err := dbPoolWorkers.ListPoolWorkers(ctx, api.PoolWorkerTypeMonitor) - if err != nil { - return err - } - - var monitors []string - if docs != nil { - monitors = make([]string, 0, len(docs.PoolWorkerDocuments)) - for _, doc := range docs.PoolWorkerDocuments { - monitors = append(monitors, doc.ID) - } - } - - mon.balance(monitors, doc) - - return nil - }) - if err != nil && err.Error() == "lost lease" { - mon.isMaster = false - } - return err -} - -// balance shares out buckets over a slice of registered monitors -func (mon *monitor) balance(monitors []string, doc *api.PoolWorkerDocument) { - // initialise doc.PoolWorker - if doc.PoolWorker == nil { - doc.PoolWorker = &api.PoolWorker{} - } - - // ensure len(doc.PoolWorker.Buckets) == mon.bucketCount: this should only 
do - // anything on the very first run - if len(doc.PoolWorker.Buckets) < mon.bucketCount { - doc.PoolWorker.Buckets = append(doc.PoolWorker.Buckets, make([]string, mon.bucketCount-len(doc.PoolWorker.Buckets))...) - } - if len(doc.PoolWorker.Buckets) > mon.bucketCount { // should never happen - doc.PoolWorker.Buckets = doc.PoolWorker.Buckets[:mon.bucketCount] - } - - var unallocated []int - m := make(map[string][]int, len(monitors)) // map of monitor to list of buckets it owns - for _, monitor := range monitors { - m[monitor] = nil - } - - var target int // target number of buckets per monitor - if len(monitors) > 0 { - target = mon.bucketCount / len(monitors) - if mon.bucketCount%len(monitors) != 0 { - target++ - } - } - - // load the current bucket allocations into the map - for i, monitor := range doc.PoolWorker.Buckets { - if buckets, found := m[monitor]; found && len(buckets) < target { - // if the current bucket is allocated to a known monitor and doesn't - // take its number of buckets above the target, keep it there... 
- m[monitor] = append(m[monitor], i) - } else { - // ...otherwise we'll reallocate it below - unallocated = append(unallocated, i) - } - } - - // reallocate all unallocated buckets, appending to the least loaded monitor - if len(monitors) > 0 { - for _, i := range unallocated { - var leastMonitor string - for monitor := range m { - if leastMonitor == "" || - len(m[monitor]) < len(m[leastMonitor]) { - leastMonitor = monitor - } - } - - m[leastMonitor] = append(m[leastMonitor], i) - } - } - - // write the updated bucket allocations back to the document - for _, i := range unallocated { - doc.PoolWorker.Buckets[i] = "" // should only happen if there are no known monitors - } - for monitor, buckets := range m { - for _, i := range buckets { - doc.PoolWorker.Buckets[i] = monitor - } - } -} diff --git a/pkg/monitor/monitor.go b/pkg/monitor/monitor.go index 42cf83786ad..e2f5fd2f1bc 100644 --- a/pkg/monitor/monitor.go +++ b/pkg/monitor/monitor.go @@ -28,6 +28,7 @@ import ( "github.com/Azure/ARO-RP/pkg/monitor/monitoring" "github.com/Azure/ARO-RP/pkg/proxy" "github.com/Azure/ARO-RP/pkg/util/bucket" + "github.com/Azure/ARO-RP/pkg/util/buckets" "github.com/Azure/ARO-RP/pkg/util/changefeed" "github.com/Azure/ARO-RP/pkg/util/heartbeat" ) @@ -160,32 +161,7 @@ func (mon *monitor) Run(ctx context.Context) error { go heartbeat.EmitHeartbeat(mon.baseLog, mon.m, "monitor.heartbeat", nil, mon.checkReady) - for { - // register ourself as a monitor, ttl of 60s default - err = dbPoolWorkers.PoolWorkerHeartbeat(ctx, api.PoolWorkerTypeMonitor, int(mon.changefeedInterval.Seconds()*6)) - if err != nil { - mon.baseLog.Error(fmt.Errorf("error registering ourselves as a Monitor poolWorker, continuing: %w", err)) - } - - // try to become master and share buckets across registered monitors - err = mon.master(ctx) - if err != nil { - mon.baseLog.Error(fmt.Errorf("error registering ourselves as the master: %w", err)) - } - - // read our bucket allocation from the master - err = mon.listBuckets(ctx) 
- if err != nil { - mon.baseLog.Error(fmt.Errorf("error reading bucket allocation from master: %w", err)) - } else { - mon.lastBucketlist.Store(time.Now()) - } - - if err = ctx.Err(); err != nil { - return err - } - <-t.C - } + return buckets.StartBucketWorkerLoop(ctx, mon.baseLog, api.PoolWorkerTypeMonitor, mon.bucketCount, mon.changefeedInterval, dbPoolWorkers, mon.onBuckets) } func (mon *monitor) startChangefeeds(ctx context.Context, stop <-chan struct{}) error { diff --git a/pkg/monitor/worker.go b/pkg/monitor/worker.go index 13a57d050a1..505b74e3862 100644 --- a/pkg/monitor/worker.go +++ b/pkg/monitor/worker.go @@ -6,7 +6,6 @@ package monitor import ( "context" "errors" - "fmt" "reflect" "strings" "sync" @@ -35,20 +34,11 @@ var subscriptionStateLogFrequency = 30 * time.Minute // changefeedBatchSize is how many items in the changefeed to fetch in each page const changefeedBatchSize = 50 -// listBuckets reads our bucket allocation from the master -func (mon *monitor) listBuckets(ctx context.Context) error { - dbPoolWorkers, err := mon.dbGroup.PoolWorkers() - if err != nil { - return err - } - - buckets, err := dbPoolWorkers.ListBuckets(ctx, api.PoolWorkerTypeMonitor) - if err != nil { - return err - } - +// onBuckets is called when we fetch our bucket allocation from the master +func (mon *monitor) onBuckets(buckets []int) { if len(buckets) == 0 { - return fmt.Errorf("bucket allocation contained no buckets") + mon.baseLog.Error("bucket allocation contained no buckets") + return } mon.mu.Lock() @@ -66,7 +56,7 @@ func (mon *monitor) listBuckets(ctx context.Context) error { mon.fixDocs() } - return err + mon.lastBucketlist.Store(time.Now()) } type clusterChangeFeedResponder struct { diff --git a/pkg/util/buckets/balancer.go b/pkg/util/buckets/balancer.go new file mode 100644 index 00000000000..24dddae54f3 --- /dev/null +++ b/pkg/util/buckets/balancer.go @@ -0,0 +1,173 @@ +package buckets + +import ( + "context" + "fmt" + "time" + + "github.com/sirupsen/logrus" + 
"github.com/Azure/ARO-RP/pkg/api" + "github.com/Azure/ARO-RP/pkg/database" +) + +func StartBucketWorkerLoop( + ctx context.Context, + log *logrus.Entry, + workerType api.PoolWorkerType, + bucketCount int, + interval time.Duration, + dbPoolWorkers database.PoolWorkers, + onBucketChange func([]int), +) error { + t := time.NewTicker(interval) + defer t.Stop() + + isMaster := false + for { + // register ourselves as a pool worker, with a TTL of six refresh intervals + err := dbPoolWorkers.PoolWorkerHeartbeat(ctx, workerType, int(interval.Seconds()*6)) + if err != nil { + log.Error(fmt.Errorf("error registering ourselves as a %s poolWorker, continuing: %w", workerType, err)) + } + + isMaster, err = tryMaster(ctx, log, workerType, bucketCount, dbPoolWorkers, isMaster) + if err != nil { + log.Error(fmt.Errorf("error registering ourselves as the master, continuing: %w", err)) + } + + buckets, err := dbPoolWorkers.ListBuckets(ctx, workerType) + if err != nil { + log.Error(fmt.Errorf("error reading bucket allocation from master: %w", err)) + } else { + onBucketChange(buckets) + } + + if err = ctx.Err(); err != nil { + return err + } + <-t.C + } +} + +// tryMaster updates the PoolWorkerDocument with the list of buckets balanced between +// registered workers +func tryMaster( + ctx context.Context, + log *logrus.Entry, + workerType api.PoolWorkerType, + bucketCount int, + dbPoolWorkers database.PoolWorkers, + isMaster bool, +) (bool, error) { + // if we know we're not the master, attempt to gain the lease on the + // PoolWorkerDocument + if !isMaster { + doc, err := dbPoolWorkers.TryLease(ctx, workerType) + if err != nil || doc == nil { + return false, err + } + isMaster = true + log.Infof("became the %s master", workerType) + } + + // we know we're not the master; give up + if !isMaster { + return false, nil + } + + // we think we're the master. Gather up all the registered workers + // including ourself, balance buckets between them and write the bucket + // allocations to the database. 
If it turns out that we're not the master, + // the patch will fail + _, err := dbPoolWorkers.PatchWithLease(ctx, workerType, string(workerType), func(doc *api.PoolWorkerDocument) error { + docs, err := dbPoolWorkers.ListPoolWorkers(ctx, workerType) + if err != nil { + return err + } + + var workers []string + if docs != nil { + workers = make([]string, 0, len(docs.PoolWorkerDocuments)) + for _, doc := range docs.PoolWorkerDocuments { + workers = append(workers, doc.ID) + } + } + + balance(workers, bucketCount, doc) + return nil + }) + if err != nil && err.Error() == "lost lease" { + isMaster = false + log.Infof("stopped being the %s master", workerType) + } + return isMaster, err +} + +// balance shares out buckets over a slice of registered workers +func balance(workers []string, bucketCount int, doc *api.PoolWorkerDocument) { + // initialise doc.PoolWorker + if doc.PoolWorker == nil { + doc.PoolWorker = &api.PoolWorker{} + } + + // ensure len(doc.PoolWorker.Buckets) == bucketCount: this should only do + // anything on the very first run + if len(doc.PoolWorker.Buckets) < bucketCount { + doc.PoolWorker.Buckets = append(doc.PoolWorker.Buckets, make([]string, bucketCount-len(doc.PoolWorker.Buckets))...) 
+ } + if len(doc.PoolWorker.Buckets) > bucketCount { // should never happen + doc.PoolWorker.Buckets = doc.PoolWorker.Buckets[:bucketCount] + } + + var unallocated []int + m := make(map[string][]int, len(workers)) // map of worker to list of buckets it owns + for _, monitor := range workers { + m[monitor] = nil + } + + var target int // target number of buckets per worker + if len(workers) > 0 { + target = bucketCount / len(workers) + if bucketCount%len(workers) != 0 { + target++ + } + } + + // load the current bucket allocations into the map + for i, worker := range doc.PoolWorker.Buckets { + if buckets, found := m[worker]; found && len(buckets) < target { + // if the current bucket is allocated to a known worker and doesn't + // take its number of buckets above the target, keep it there... + m[worker] = append(m[worker], i) + } else { + // ...otherwise we'll reallocate it below + unallocated = append(unallocated, i) + } + } + + // reallocate all unallocated buckets, appending to the least loaded worker + if len(workers) > 0 { + for _, i := range unallocated { + var leastWorker string + for worker := range m { + if leastWorker == "" || + len(m[worker]) < len(m[leastWorker]) { + leastWorker = worker + } + } + + m[leastWorker] = append(m[leastWorker], i) + } + } + + // write the updated bucket allocations back to the document + for _, i := range unallocated { + doc.PoolWorker.Buckets[i] = "" // should only happen if there are no known workers + } + for worker, buckets := range m { + for _, i := range buckets { + doc.PoolWorker.Buckets[i] = worker + } + } +} diff --git a/pkg/monitor/master_test.go b/pkg/util/buckets/balancer_test.go similarity index 97% rename from pkg/monitor/master_test.go rename to pkg/util/buckets/balancer_test.go index 74799e7d84e..c78a6fc3c4e 100644 --- a/pkg/monitor/master_test.go +++ b/pkg/util/buckets/balancer_test.go @@ -1,4 +1,4 @@ -package monitor +package buckets // Copyright (c) Microsoft Corporation. 
// Licensed under the Apache License 2.0. @@ -164,13 +164,9 @@ func TestBalance(t *testing.T) { }, } { t.Run(tt.name, func(t *testing.T) { - mon := &monitor{ - bucketCount: 8, - } - doc := tt.doc() - mon.balance(tt.monitors, doc) + balance(tt.monitors, 8, doc) if doc.PoolWorker == nil { t.Fatal(doc.PoolWorker) From 143d8b52ac47d2f6ab54c2cca9af7c8f1bba8ac2 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 15:46:45 +1100 Subject: [PATCH 05/20] update cmd/aro --- cmd/aro/monitor.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/aro/monitor.go b/cmd/aro/monitor.go index 1d40b23c47f..3b147326784 100644 --- a/cmd/aro/monitor.go +++ b/cmd/aro/monitor.go @@ -75,7 +75,7 @@ func monitor(ctx context.Context, _log *logrus.Entry) error { return err } - dbMonitors, err := database.NewMonitors(ctx, dbc, dbName) + dbPoolWorkers, err := database.NewPoolWorkers(ctx, dbc, dbName) if err != nil { return err } @@ -92,7 +92,7 @@ func monitor(ctx context.Context, _log *logrus.Entry) error { dbg := database.NewDBGroup().WithOpenShiftClusters(dbOpenShiftClusters). WithSubscriptions(dbSubscriptions). 
- WithMonitors(dbMonitors) + WithPoolWorkers(dbPoolWorkers) dialer, err := proxy.NewDialer(_env.IsLocalDevelopmentMode(), _env.LoggerForComponent("dialer")) if err != nil { From d96acca630ac04394d8c2b6bacd47c9566766a58 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 16:07:10 +1100 Subject: [PATCH 06/20] actuator: port to use buckets --- pkg/mimo/actuator/service.go | 87 ++++++++++--------------------- pkg/mimo/actuator/service_test.go | 87 ++----------------------------- pkg/monitor/monitor.go | 2 +- pkg/util/buckets/balancer.go | 8 ++- pkg/util/buckets/buckets.go | 11 ++-- 5 files changed, 46 insertions(+), 149 deletions(-) diff --git a/pkg/mimo/actuator/service.go b/pkg/mimo/actuator/service.go index df24161bfcb..cee12bfe011 100644 --- a/pkg/mimo/actuator/service.go +++ b/pkg/mimo/actuator/service.go @@ -10,7 +10,6 @@ import ( "math/rand" "net" "net/http" - "strconv" "strings" "sync" "sync/atomic" @@ -51,15 +50,18 @@ type service struct { workers *atomic.Int32 workerRoutines sync.WaitGroup - b buckets.BucketWorker[*api.OpenShiftClusterDocument] + b buckets.BucketWorker[*api.OpenShiftClusterDocument] + bucketCount int changefeedBatchSize int changefeedInterval time.Duration changefeedReadinessInterval time.Duration taskPollTime time.Duration + bucketRefreshInterval time.Duration - lastChangefeed atomic.Value // time.Time - startTime time.Time + lastChangefeed atomic.Value // time.Time + lastBucketUpdate atomic.Value // time.Time + startTime time.Time now func() time.Time workerDelay func() time.Duration @@ -74,9 +76,10 @@ var _ Runnable = (*service)(nil) type actuatorDBs interface { database.DatabaseGroupWithOpenShiftClusters database.DatabaseGroupWithMaintenanceManifests + database.DatabaseGroupWithPoolWorkers } -func NewService(env env.Interface, log *logrus.Entry, dialer proxy.Dialer, dbg actuatorDBs, m metrics.Emitter, ownedBuckets []int) *service { +func NewService(env env.Interface, log *logrus.Entry, dialer proxy.Dialer, dbg actuatorDBs, m 
metrics.Emitter) *service { s := &service{ env: env, baseLog: log, @@ -84,9 +87,10 @@ func NewService(env env.Interface, log *logrus.Entry, dialer proxy.Dialer, dbg a dbGroup: dbg, - m: m, - stopping: &atomic.Bool{}, - workers: &atomic.Int32{}, + m: m, + stopping: &atomic.Bool{}, + workers: &atomic.Int32{}, + bucketCount: bucket.Buckets, startTime: time.Now(), workerDelay: func() time.Duration { return time.Duration(rand.Intn(60)) * time.Second }, @@ -106,13 +110,14 @@ func NewService(env env.Interface, log *logrus.Entry, dialer proxy.Dialer, dbg a // prioritise responsiveness taskPollTime: 90 * time.Second, + // Bucket timing is set lower to prioritise responsiveness to VM changes + bucketRefreshInterval: 30 * time.Second, + serveHealthz: true, } s.cond = sync.NewCond(&s.mu) s.b = buckets.NewBucketWorker[*api.OpenShiftClusterDocument](log, s.spawnWorker, &s.mu) - s.b.SetBuckets(ownedBuckets) - return s } @@ -123,6 +128,11 @@ func (s *service) SetMaintenanceTasks(tasks map[api.MIMOTaskID]tasks.Maintenance func (s *service) Run(ctx context.Context, stop <-chan struct{}, done chan<- struct{}) error { defer recover.Panic(s.baseLog) + dbPoolWorkers, err := s.dbGroup.PoolWorkers() + if err != nil { + return err + } + // Only enable the healthz endpoint if configured (disabled in unit tests) if s.serveHealthz { c := &healthz.Handler{ @@ -175,6 +185,13 @@ func (s *service) Run(ctx context.Context, stop <-chan struct{}, done chan<- str } go heartbeat.EmitHeartbeat(s.baseLog, s.m, "actuator.heartbeat", nil, s.checkReady) + // Start the bucket worker update loop which will coordinate buckets between + // the MIMO instances + go buckets.StartBucketWorkerLoop( + ctx, s.baseLog, api.PoolWorkerTypeMIMOActuator, + s.bucketCount, s.bucketRefreshInterval, dbPoolWorkers, s.b.SetBuckets, stop, + ) + lastGotDocs := make(map[string]*api.OpenShiftClusterDocument) for !s.stopping.Load() { old, err := s.poll(ctx, lastGotDocs) @@ -349,53 +366,3 @@ out: } } } - -// DetermineBuckets uses the 
hostname to figure out which subset of buckets we -// should be serving. -func DetermineBuckets(env env.Core, hostnameFunc func() (string, error)) []int { - _log := env.Logger() - - // We have a VMSS with 3 VMs in prod - vmCount := 3 - - b := []int{} - if !env.IsLocalDevelopmentMode() { - name, err := hostnameFunc() - if err != nil { - // if we can't get the hostname then just run all of them - _log.Warn("unable to get the hostname for bucket determination") - } else { - // figure out which VMSS host we're running on - e.g. rp-v20000101.01-000001" - splitName := strings.Split(name, "-") - if len(splitName) > 1 { - num, err := strconv.Atoi(splitName[len(splitName)-1]) - if err != nil { - _log.Warningf("hostname %s doesn't end in a number, unable to partition buckets", name) - } else { - if num >= vmCount { - // Rather than guess, we fall back to all buckets. This - // means that a VMSS replacement of -3 might have some - // weird behaviour, but because we get a lock on the - // OpenShiftClusterObject before we do anything to the - // cluster, it should be fine. - _log.Warningf("vmss number is %d, currently only handles 3 partitions (vm numbers 0-2), falling back to all", num) - } else { - // For the 3 VMs, VM 1 will serve buckets 0,3,6..., - // VM 2 will serve 1,4,7... VM 3 will serve 2,5,8... 
- for i := num; i < bucket.Buckets; i += vmCount { - b = append(b, i) - } - } - } - } - } - } - - // We haven't figured out our buckets so fall back to all - if len(b) == 0 { - for i := range 256 { - b = append(b, i) - } - } - return b -} diff --git a/pkg/mimo/actuator/service_test.go b/pkg/mimo/actuator/service_test.go index b80890370dd..f005c5aae11 100644 --- a/pkg/mimo/actuator/service_test.go +++ b/pkg/mimo/actuator/service_test.go @@ -5,9 +5,7 @@ package actuator import ( "context" - "errors" "fmt" - "slices" "strings" "sync" "testing" @@ -173,7 +171,7 @@ func TestActuatorPolling(t *testing.T) { err := fixtures.WithOpenShiftClusters(clusters).WithSubscriptions(subscriptions).WithMaintenanceManifests(manifests).Create() require.NoError(err) - svc := NewService(_env, log, nil, dbs, metrics, []int{1}) + svc := NewService(_env, log, nil, dbs, metrics) svc.now = now svc.workerDelay = func() time.Duration { return 0 * time.Second } svc.serveHealthz = false @@ -254,10 +252,11 @@ var _ = Describe("MIMO Actuator Service", Ordered, func() { clusters, _ = testdatabase.NewFakeOpenShiftClusters() dbg := database.NewDBGroup().WithMaintenanceManifests(manifests).WithOpenShiftClusters(clusters) - svc = NewService(_env, log, nil, dbg, m, []int{1}) + svc = NewService(_env, log, nil, dbg, m) svc.now = now svc.workerDelay = func() time.Duration { return 0 * time.Second } svc.serveHealthz = false + svc.b.SetBuckets([]int{1}) }) JustBeforeEach(func() { @@ -424,83 +423,3 @@ var _ = Describe("MIMO Actuator Service", Ordered, func() { }) }) }) - -var _ = Describe("MIMO Bucket Partitioning", Ordered, func() { - var controller *gomock.Controller - var _env *mock_env.MockInterface - var log *logrus.Entry - - BeforeAll(func() { - log = logrus.NewEntry(&logrus.Logger{ - Out: GinkgoWriter, - Formatter: new(logrus.TextFormatter), - Hooks: make(logrus.LevelHooks), - Level: logrus.DebugLevel, - }) - - controller = gomock.NewController(nil) - _env = mock_env.NewMockInterface(controller) - - 
_env.EXPECT().Logger().Return(log).AnyTimes() - }) - - It("serves all buckets with 3 workers", func() { - _env.EXPECT().IsLocalDevelopmentMode().Return(false).Times(3) - - b1 := DetermineBuckets(_env, func() (string, error) { return "vm-00", nil }) - b2 := DetermineBuckets(_env, func() (string, error) { return "vm-01", nil }) - b3 := DetermineBuckets(_env, func() (string, error) { return "vm-02", nil }) - - all := slices.Concat(b1, b2, b3) - - Expect(all).To(HaveLen(256)) - for i := range 256 { - Expect(all).To(ContainElement(i)) - } - }) - - It("will serve all buckets if it cannot get the hostname", func() { - _env.EXPECT().IsLocalDevelopmentMode().Return(false) - b1 := DetermineBuckets(_env, func() (string, error) { return "", errors.New("boo") }) - - for i := range 256 { - Expect(b1).To(ContainElement(i)) - } - }) - - It("will serve all buckets if it does not understand the hostname", func() { - _env.EXPECT().IsLocalDevelopmentMode().Return(false) - b1 := DetermineBuckets(_env, func() (string, error) { return "foobar", nil }) - - for i := range 256 { - Expect(b1).To(ContainElement(i)) - } - }) - - It("will serve all buckets if the hostname does not end in a number", func() { - _env.EXPECT().IsLocalDevelopmentMode().Return(false) - b1 := DetermineBuckets(_env, func() (string, error) { return "vm-bar", nil }) - - for i := range 256 { - Expect(b1).To(ContainElement(i)) - } - }) - - It("will serve all buckets if the hostname ending in a number that is not 0-2", func() { - _env.EXPECT().IsLocalDevelopmentMode().Return(false) - b1 := DetermineBuckets(_env, func() (string, error) { return "vm-03", nil }) - - for i := range 256 { - Expect(b1).To(ContainElement(i)) - } - }) - - It("will serve all buckets in local dev", func() { - _env.EXPECT().IsLocalDevelopmentMode().Return(true) - b1 := DetermineBuckets(_env, func() (string, error) { return "vm-01", nil }) - - for i := range 256 { - Expect(b1).To(ContainElement(i)) - } - }) -}) diff --git a/pkg/monitor/monitor.go 
b/pkg/monitor/monitor.go index e2f5fd2f1bc..db5962fb423 100644 --- a/pkg/monitor/monitor.go +++ b/pkg/monitor/monitor.go @@ -161,7 +161,7 @@ func (mon *monitor) Run(ctx context.Context) error { go heartbeat.EmitHeartbeat(mon.baseLog, mon.m, "monitor.heartbeat", nil, mon.checkReady) - return buckets.StartBucketWorkerLoop(ctx, mon.baseLog, api.PoolWorkerTypeMonitor, mon.bucketCount, mon.changefeedInterval, dbPoolWorkers, mon.onBuckets) + return buckets.StartBucketWorkerLoop(ctx, mon.baseLog, api.PoolWorkerTypeMonitor, mon.bucketCount, mon.changefeedInterval, dbPoolWorkers, mon.onBuckets, nil) } func (mon *monitor) startChangefeeds(ctx context.Context, stop <-chan struct{}) error { diff --git a/pkg/util/buckets/balancer.go b/pkg/util/buckets/balancer.go index 24dddae54f3..2b39e781062 100644 --- a/pkg/util/buckets/balancer.go +++ b/pkg/util/buckets/balancer.go @@ -19,6 +19,7 @@ func StartBucketWorkerLoop( interval time.Duration, dbPoolWorkers database.PoolWorkers, onBucketChange func([]int), + stop <-chan struct{}, ) error { t := time.NewTicker(interval) defer t.Stop() @@ -46,7 +47,12 @@ func StartBucketWorkerLoop( if err = ctx.Err(); err != nil { return err } - <-t.C + + select { + case <-t.C: + case <-stop: + return nil + } } } diff --git a/pkg/util/buckets/buckets.go b/pkg/util/buckets/buckets.go index d2890ad8d70..49249f251df 100644 --- a/pkg/util/buckets/buckets.go +++ b/pkg/util/buckets/buckets.go @@ -4,6 +4,7 @@ package buckets // Licensed under the Apache License 2.0. 
import ( + "reflect" "strings" "sync" @@ -76,13 +77,17 @@ func (mon *monitor[E]) Doc(id string) (r E, ok bool) { func (mon *monitor[E]) SetBuckets(buckets []int) { mon.mu.Lock() defer mon.mu.Unlock() - mon.buckets = map[int]struct{}{} + oldBuckets := mon.buckets + mon.buckets = make(map[int]struct{}, len(buckets)) for _, i := range buckets { mon.buckets[i] = struct{}{} } - for _, v := range mon.docs { - mon.FixDoc(v.doc) + if !reflect.DeepEqual(mon.buckets, oldBuckets) { + mon.baseLog.Printf("servicing %d buckets", len(mon.buckets)) + for _, v := range mon.docs { + mon.FixDoc(v.doc) + } } } From d07edc5fced4b2ee2a09da2b13b494db8a54ce7a Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 16:13:44 +1100 Subject: [PATCH 07/20] add bucket refreshing to the readiness delay --- pkg/mimo/actuator/service.go | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/pkg/mimo/actuator/service.go b/pkg/mimo/actuator/service.go index cee12bfe011..2d9b72cb3a4 100644 --- a/pkg/mimo/actuator/service.go +++ b/pkg/mimo/actuator/service.go @@ -53,11 +53,12 @@ type service struct { b buckets.BucketWorker[*api.OpenShiftClusterDocument] bucketCount int - changefeedBatchSize int - changefeedInterval time.Duration - changefeedReadinessInterval time.Duration - taskPollTime time.Duration - bucketRefreshInterval time.Duration + changefeedBatchSize int + changefeedInterval time.Duration + changefeedReadinessInterval time.Duration + taskPollTime time.Duration + bucketRefreshInterval time.Duration + bucketRefreshReadinessInterval time.Duration lastChangefeed atomic.Value // time.Time lastBucketUpdate atomic.Value // time.Time @@ -65,6 +66,7 @@ type service struct { now func() time.Time workerDelay func() time.Duration + readyDelay time.Duration tasks map[api.MIMOTaskID]tasks.MaintenanceTask @@ -111,8 +113,10 @@ func NewService(env env.Interface, log *logrus.Entry, dialer proxy.Dialer, dbg a taskPollTime: 90 * time.Second, // Bucket 
timing is set lower to prioritise responsiveness to VM changes - bucketRefreshInterval: 30 * time.Second, + bucketRefreshInterval: 30 * time.Second, + bucketRefreshReadinessInterval: 45 * time.Second, + readyDelay: time.Minute * 2, serveHealthz: true, } @@ -189,7 +193,10 @@ func (s *service) Run(ctx context.Context, stop <-chan struct{}, done chan<- str // the MIMO instances go buckets.StartBucketWorkerLoop( ctx, s.baseLog, api.PoolWorkerTypeMIMOActuator, - s.bucketCount, s.bucketRefreshInterval, dbPoolWorkers, s.b.SetBuckets, stop, + s.bucketCount, s.bucketRefreshInterval, dbPoolWorkers, func(i []int) { + s.b.SetBuckets(i) + s.lastBucketUpdate.Store(s.now()) + }, stop, ) lastGotDocs := make(map[string]*api.OpenShiftClusterDocument) @@ -286,17 +293,18 @@ func (s *service) waitForWorkerCompletion() { } func (s *service) checkReady() bool { + lastBucketUpdate, ok := s.lastBucketUpdate.Load().(time.Time) + if !ok { + return false + } lastChangefeedTime, ok := s.lastChangefeed.Load().(time.Time) if !ok { return false } - if s.env.IsLocalDevelopmentMode() { - return (time.Since(lastChangefeedTime) < s.changefeedReadinessInterval) // did we update our list of clusters recently? - } else { - return (time.Since(lastChangefeedTime) < s.changefeedReadinessInterval) && // did we update our list of clusters recently? - (time.Since(s.startTime) > 2*time.Minute) // are we running for at least 2 minutes? - } + return (time.Since(lastBucketUpdate) < s.bucketRefreshReadinessInterval) && // did we list buckets successfully recently? + (time.Since(lastChangefeedTime) < s.changefeedReadinessInterval) && // did we update our list of clusters recently? + (time.Since(s.startTime) > s.readyDelay) // are we running for at least (the default) 2 minutes? 
} func (s *service) spawnWorker(stop <-chan struct{}, id string) { From fe0dec379144cfcd0379ab01208ebb32f0729a68 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 16:49:47 +1100 Subject: [PATCH 08/20] update the mimo scheduler to use the common bucketing method --- pkg/api/mimodocument.go | 2 +- pkg/mimo/scheduler/clustercache.go | 13 +--- pkg/mimo/scheduler/manager_test.go | 47 ++++-------- pkg/mimo/scheduler/selectors.go | 2 + pkg/mimo/scheduler/service.go | 111 ++++++++++++++++++++++------- pkg/mimo/scheduler/service_test.go | 6 +- pkg/util/buckets/cache.go | 10 ++- 7 files changed, 112 insertions(+), 79 deletions(-) diff --git a/pkg/api/mimodocument.go b/pkg/api/mimodocument.go index a8da000521b..16304ed1fe3 100644 --- a/pkg/api/mimodocument.go +++ b/pkg/api/mimodocument.go @@ -99,5 +99,5 @@ func (c *MaintenanceScheduleDocument) GetKey() string { } func (c *MaintenanceScheduleDocument) GetBucket() int { - return 0 + return -1 } diff --git a/pkg/mimo/scheduler/clustercache.go b/pkg/mimo/scheduler/clustercache.go index 1ab1f1223a9..b19a616e5b3 100644 --- a/pkg/mimo/scheduler/clustercache.go +++ b/pkg/mimo/scheduler/clustercache.go @@ -7,7 +7,6 @@ import ( "fmt" "iter" "reflect" - "slices" "strings" "sync" "sync/atomic" @@ -30,12 +29,11 @@ type openShiftClusterCache struct { subCache changefeed.SubscriptionsCache clusters *xsync.Map[string, selectorData] - ownedBuckets []int lastChangefeed atomic.Value // time.Time initialPopulationWaitGroup *sync.WaitGroup } -func newOpenShiftClusterCache(log *logrus.Entry, m metrics.Emitter, subCache changefeed.SubscriptionsCache, ownedBuckets []int) *openShiftClusterCache { +func newOpenShiftClusterCache(log *logrus.Entry, m metrics.Emitter, subCache changefeed.SubscriptionsCache) *openShiftClusterCache { wg := &sync.WaitGroup{} wg.Add(1) return &openShiftClusterCache{ @@ -43,7 +41,6 @@ func newOpenShiftClusterCache(log *logrus.Entry, m metrics.Emitter, subCache cha m: m, subCache: subCache, clusters: 
xsync.NewMap[string, selectorData](), - ownedBuckets: ownedBuckets, initialPopulationWaitGroup: wg, } } @@ -65,14 +62,6 @@ func (c *openShiftClusterCache) OnDoc(doc *api.OpenShiftClusterDocument) { ps := doc.OpenShiftCluster.Properties.ProvisioningState fps := doc.OpenShiftCluster.Properties.FailedProvisioningState - // If we don't own the bucket, delete it (which will be a no-op) just in - // case our buckets changed (even though MIMO doesn't support balancing that - // right now) - if !slices.Contains(c.ownedBuckets, doc.Bucket) { - c.clusters.Delete(id) - return - } - switch { case ps == api.ProvisioningStateCreating, ps == api.ProvisioningStateDeleting, diff --git a/pkg/mimo/scheduler/manager_test.go b/pkg/mimo/scheduler/manager_test.go index 226b2fa4c99..5103e8d75fd 100644 --- a/pkg/mimo/scheduler/manager_test.go +++ b/pkg/mimo/scheduler/manager_test.go @@ -19,7 +19,6 @@ import ( "github.com/Azure/ARO-RP/pkg/database" "github.com/Azure/ARO-RP/pkg/mimo/tasks" "github.com/Azure/ARO-RP/pkg/monitor/dimension" - "github.com/Azure/ARO-RP/pkg/util/changefeed" "github.com/Azure/ARO-RP/pkg/util/mimo" mock_env "github.com/Azure/ARO-RP/pkg/util/mocks/env" testdatabase "github.com/Azure/ARO-RP/test/database" @@ -38,7 +37,6 @@ func TestProcessLoop(t *testing.T) { mockSubID := "00000000-0000-0000-0000-000000000000" mockTenantID := "00001111-0000-0000-0000-000000000000" clusterResourceID := fmt.Sprintf("/subscriptions/%s/resourcegroups/resourceGroup/providers/Microsoft.RedHatOpenShift/openShiftClusters/resourceName", mockSubID) - clusterResourceID2 := fmt.Sprintf("/subscriptions/%s/resourcegroups/resourceGroup/providers/Microsoft.RedHatOpenShift/openShiftClusters/resourceName2", mockSubID) base_logs := []testlog.ExpectedLogEntry{ { @@ -937,20 +935,25 @@ func TestProcessLoop(t *testing.T) { clusters, _ := testdatabase.NewFakeOpenShiftClusters() subscriptions, _ := testdatabase.NewFakeSubscriptions() - dbs := 
database.NewDBGroup().WithMaintenanceSchedules(schedules).WithOpenShiftClusters(clusters).WithMaintenanceManifests(manifests) + dbs := database.NewDBGroup(). + WithMaintenanceSchedules(schedules). + WithOpenShiftClusters(clusters). + WithMaintenanceManifests(manifests). + WithSubscriptions(subscriptions) - subsCache := changefeed.NewSubscriptionsChangefeedCache(metrics, false) - clusterCache := newOpenShiftClusterCache(log, metrics, subsCache, []int{1}) stop := make(chan struct{}) t.Cleanup(func() { close(stop) }) + serv := NewService(_env, log, dbs, metrics) + serv.changefeedInterval = 10 * time.Millisecond + a := &scheduler{ log: log, env: _env, m: metrics, dbs: dbs, - getClusters: clusterCache.GetClusters, + getClusters: serv.clusters.GetClusters, tasks: map[api.MIMOTaskID]tasks.MaintenanceTask{}, now: now, @@ -983,20 +986,6 @@ func TestProcessLoop(t *testing.T) { }, }) - // Add a cluster that does not meet our bucket requirements and so - // won't cause any Manifests to be created - fixtures.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{ - Key: strings.ToLower(clusterResourceID2), - Bucket: 2, - OpenShiftCluster: &api.OpenShiftCluster{ - ID: clusterResourceID2, - Properties: api.OpenShiftClusterProperties{ - ProvisioningState: api.ProvisioningStateSucceeded, - MaintenanceState: api.MaintenanceStateNone, - }, - }, - }) - // Add the schedule + any existing manifests to the fixture fixtures.AddMaintenanceScheduleDocuments(tt.schedule) fixtures.AddMaintenanceManifestDocuments(tt.existingManifests...) 
@@ -1015,24 +1004,12 @@ func TestProcessLoop(t *testing.T) { checker.AddMaintenanceScheduleDocuments(tt.schedule) } - // fire up the changefeeds - go changefeed.RunChangefeed( - ctx, log.WithField("component", "subchangefeed"), subscriptions.ChangeFeed(), - 10*time.Millisecond, - 10, subsCache, stop, - ) - - // start cluster changefeed - go changefeed.RunChangefeed( - ctx, log.WithField("component", "clusterchangefeed"), clusters.ChangeFeed(), - 10*time.Millisecond, - 10, clusterCache, stop, - ) + err = serv.startChangefeeds(ctx, stop) + require.NoError(err) + serv.clusters.initialPopulationWaitGroup.Wait() a.cachedDoc = func() (*api.MaintenanceScheduleDocument, bool) { return tt.schedule, true } - clusterCache.initialPopulationWaitGroup.Wait() - for i := range tt.extraRuns + 1 { didWork, err := a.Process(ctx) require.NoError(err, "during run", i+1) diff --git a/pkg/mimo/scheduler/selectors.go b/pkg/mimo/scheduler/selectors.go index 8586399816c..5985dbf70bb 100644 --- a/pkg/mimo/scheduler/selectors.go +++ b/pkg/mimo/scheduler/selectors.go @@ -28,6 +28,7 @@ type SelectorDataKey string const ( SelectorDataKeyResourceID SelectorDataKey = "resourceID" + SelectorDataBucketID SelectorDataKey = "bucketID" SelectorDataKeySubscriptionID SelectorDataKey = "subscriptionID" SelectorDataKeySubscriptionState SelectorDataKey = "subscriptionState" SelectorDataKeyAuthenticationType SelectorDataKey = "authenticationType" @@ -107,6 +108,7 @@ func ToSelectorData(doc *api.OpenShiftClusterDocument, subscriptionState string) return nil, err } + new[SelectorDataBucketID] = fmt.Sprintf("%d", doc.Bucket) new[SelectorDataKeyResourceID] = resourceID new[SelectorDataKeySubscriptionID] = r.SubscriptionID new[SelectorDataKeySubscriptionState] = subscriptionState diff --git a/pkg/mimo/scheduler/service.go b/pkg/mimo/scheduler/service.go index fcd55c0285d..5d9c9dd872b 100644 --- a/pkg/mimo/scheduler/service.go +++ b/pkg/mimo/scheduler/service.go @@ -6,6 +6,8 @@ package scheduler import ( "context" 
"errors" + "fmt" + "iter" "log" "math/rand" "net" @@ -24,6 +26,7 @@ import ( "github.com/Azure/ARO-RP/pkg/env" "github.com/Azure/ARO-RP/pkg/metrics" "github.com/Azure/ARO-RP/pkg/mimo/tasks" + "github.com/Azure/ARO-RP/pkg/util/bucket" "github.com/Azure/ARO-RP/pkg/util/buckets" "github.com/Azure/ARO-RP/pkg/util/changefeed" "github.com/Azure/ARO-RP/pkg/util/heartbeat" @@ -47,19 +50,25 @@ type service struct { workerRoutines sync.WaitGroup newScheduler newSchedulerFunc + buckets atomic.Value // []int b buckets.BucketWorker[*api.MaintenanceScheduleDocument] subs changefeed.SubscriptionsCache clusters *openShiftClusterCache - changefeedBatchSize int - changefeedInterval time.Duration + bucketCount int + changefeedBatchSize int + changefeedInterval time.Duration + bucketRefreshInterval time.Duration + bucketRefreshReadinessInterval time.Duration - lastChangefeed atomic.Value // time.Time - startTime time.Time + lastChangefeed atomic.Value // time.Time + lastBucketUpdate atomic.Value // time.Time + startTime time.Time pollTime time.Duration now func() time.Time workerDelay func() time.Duration + readyDelay time.Duration tasks map[api.MIMOTaskID]tasks.MaintenanceTask @@ -73,18 +82,20 @@ type schedulerDBs interface { database.DatabaseGroupWithSubscriptions database.DatabaseGroupWithMaintenanceManifests database.DatabaseGroupWithMaintenanceSchedules + database.DatabaseGroupWithPoolWorkers } -func NewService(env env.Interface, log *logrus.Entry, dbg schedulerDBs, m metrics.Emitter, ownedBuckets []int) *service { +func NewService(env env.Interface, log *logrus.Entry, dbg schedulerDBs, m metrics.Emitter) *service { s := &service{ env: env, baseLog: log, dbGroup: dbg, - m: m, - stopping: &atomic.Bool{}, - workers: &atomic.Int32{}, + m: m, + stopping: &atomic.Bool{}, + workers: &atomic.Int32{}, + bucketCount: bucket.Buckets, startTime: time.Now(), workerDelay: func() time.Duration { return time.Duration(rand.Intn(60)) * time.Second }, @@ -95,16 +106,18 @@ func NewService(env 
env.Interface, log *logrus.Entry, dbg schedulerDBs, m metric changefeedBatchSize: 50, changefeedInterval: 10 * time.Second, + // Bucket timing is set to prioritise responsiveness to VM changes + bucketRefreshInterval: 30 * time.Second, + bucketRefreshReadinessInterval: 45 * time.Second, + subs: changefeed.NewSubscriptionsChangefeedCache(m, false), + readyDelay: time.Minute * 2, serveHealthz: true, } - s.clusters = newOpenShiftClusterCache(log, m, s.subs, ownedBuckets) + s.clusters = newOpenShiftClusterCache(log, m, s.subs) s.b = buckets.NewBucketWorker[*api.MaintenanceScheduleDocument](log, s.spawnWorker, &s.mu) - // All Schedules have a bucket of 0 - s.b.SetBuckets([]int{0}) - return s } @@ -115,6 +128,11 @@ func (s *service) SetMaintenanceTasks(tasks map[api.MIMOTaskID]tasks.Maintenance func (s *service) Run(ctx context.Context, stop <-chan struct{}, done chan<- struct{}) error { defer recover.Panic(s.baseLog) + dbPoolWorkers, err := s.dbGroup.PoolWorkers() + if err != nil { + return err + } + // Only enable the healthz endpoint if configured (disabled in unit tests) if s.serveHealthz { c := &healthz.Handler{ @@ -165,13 +183,23 @@ func (s *service) Run(ctx context.Context, stop <-chan struct{}, done chan<- str }() } - err := s.startChangefeeds(ctx, stop) + err = s.startChangefeeds(ctx, stop) if err != nil { return err } go heartbeat.EmitHeartbeat(s.baseLog, s.m, "scheduler.heartbeat", nil, s.checkReady) + // Start the bucket worker update loop which will coordinate buckets between + // the MIMO instances + go buckets.StartBucketWorkerLoop( + ctx, s.baseLog, api.PoolWorkerTypeMIMOScheduler, + s.bucketCount, s.bucketRefreshInterval, dbPoolWorkers, func(i []int) { + s.buckets.Store(i) + s.lastBucketUpdate.Store(s.now()) + }, stop, + ) + lastGotDocs := make(map[string]*api.MaintenanceScheduleDocument) for !s.stopping.Load() { old, err := s.poll(ctx, lastGotDocs) @@ -232,7 +260,7 @@ func (s *service) poll(ctx context.Context, oldDocs map[string]*api.MaintenanceS 
return nil, err } - // Fetch all of the cluster UUIDs + // Fetch all of the valid schedules i, err := dbMaintenanceSchedules.GetValid(ctx, "") if err != nil { return nil, err @@ -290,6 +318,11 @@ func (s *service) poll(ctx context.Context, oldDocs map[string]*api.MaintenanceS } func (s *service) checkReady() bool { + lastBucketUpdate, ok := s.lastBucketUpdate.Load().(time.Time) + if !ok { + return false + } + lastChangefeedTime, ok := s.lastChangefeed.Load().(time.Time) if !ok { return false @@ -305,24 +338,21 @@ func (s *service) checkReady() bool { return false } - if s.env.IsLocalDevelopmentMode() { - return (time.Since(lastChangefeedTime) < time.Minute && // did we update our changefeeds recently? - time.Since(lastClusterChangefeed) < time.Minute && - time.Since(lastSubsChangefeed) < time.Minute) - } else { - return (time.Since(lastChangefeedTime) < time.Minute) && // did we update our list of clusters recently? - (time.Since(s.startTime) > 2*time.Minute) // are we running for at least 2 minutes? - } + return (time.Since(lastChangefeedTime) < time.Minute && // did we update our changefeeds recently? + time.Since(lastClusterChangefeed) < time.Minute && + time.Since(lastSubsChangefeed) < time.Minute) && + time.Since(lastBucketUpdate) < s.bucketRefreshReadinessInterval && + (time.Since(s.startTime) > s.readyDelay) // are we running for at least (the default) 2 minutes? 
} func (s *service) spawnWorker(stop <-chan struct{}, id string) { - s.workerRoutines.Add(1) - go s.worker(stop, id) + s.workerRoutines.Go(func() { + s.worker(stop, id) + }) } func (s *service) worker(stop <-chan struct{}, id string) { defer recover.Panic(s.baseLog) - defer s.workerRoutines.Done() delay := s.workerDelay() log := s.baseLog.WithFields(logrus.Fields{"scheduleID": id}) @@ -332,8 +362,37 @@ func (s *service) worker(stop <-chan struct{}, id string) { time.Sleep(delay) getDoc := func() (*api.MaintenanceScheduleDocument, bool) { return s.b.Doc(id) } + getClusters := func() iter.Seq2[string, selectorData] { + return func(yield func(string, selectorData) bool) { + _ownedBuckets, ok := s.buckets.Load().([]int) + if !ok { + // no owned buckets yet + return + } + + ownedBuckets := make(map[string]struct{}) + for _, b := range _ownedBuckets { + ownedBuckets[fmt.Sprintf("%d", b)] = struct{}{} + } + + // Only give clusters belonging to buckets we currently have owned + for cl, d := range s.clusters.GetClusters() { + bucket, ok := d.GetString(string(SelectorDataBucketID)) + if !ok { + continue + } + + _, ownedBucket := ownedBuckets[bucket] + if ownedBucket { + if !yield(cl, d) { + return + } + } + } + } + } - a, err := s.newScheduler(s.env, log, s.m, getDoc, s.clusters.GetClusters, s.dbGroup, s.now) + a, err := s.newScheduler(s.env, log, s.m, getDoc, getClusters, s.dbGroup, s.now) if err != nil { log.Error(err) return diff --git a/pkg/mimo/scheduler/service_test.go b/pkg/mimo/scheduler/service_test.go index 8713f5527ad..1dc6452c5b4 100644 --- a/pkg/mimo/scheduler/service_test.go +++ b/pkg/mimo/scheduler/service_test.go @@ -133,7 +133,7 @@ func TestSchedulerPolling(t *testing.T) { err := fixtures.WithOpenShiftClusters(clusters).WithSubscriptions(subscriptions).WithMaintenanceManifests(manifests).WithMaintenanceSchedules(schedules).Create() require.NoError(err) - svc := NewService(_env, log, dbs, metrics, []int{0}) + svc := NewService(_env, log, dbs, metrics) svc.now =
now svc.workerDelay = func() time.Duration { return 0 * time.Second } svc.serveHealthz = false @@ -197,7 +197,7 @@ func TestSchedulerStoppingWholeProcess(t *testing.T) { waitFor := &sync.WaitGroup{} sched := &fakeScheduler{waitOnProcess: waitFor} - svc := NewService(_env, log, dbs, m, []int{0}) + svc := NewService(_env, log, dbs, m) svc.workerDelay = func() time.Duration { return 0 * time.Second } svc.pollTime = 1 * time.Millisecond svc.newScheduler = func(_ env.Interface, _ *logrus.Entry, _ metrics.Emitter, _ getCachedScheduleDocFunc, _ getClustersFunc, _ schedulerDBs, _ func() time.Time) (Scheduler, error) { @@ -259,7 +259,7 @@ func TestSchedulerStoppingSingleItem(t *testing.T) { waitFor := &sync.WaitGroup{} sched := &fakeScheduler{waitOnProcess: waitFor} - svc := NewService(_env, log, dbs, m, []int{0}) + svc := NewService(_env, log, dbs, m) svc.workerDelay = func() time.Duration { return 0 * time.Second } svc.pollTime = 1 * time.Millisecond svc.newScheduler = func(_ env.Interface, _ *logrus.Entry, _ metrics.Emitter, _ getCachedScheduleDocFunc, _ getClustersFunc, _ schedulerDBs, _ func() time.Time) (Scheduler, error) { diff --git a/pkg/util/buckets/cache.go b/pkg/util/buckets/cache.go index 04abc5f0262..be50d1d59c6 100644 --- a/pkg/util/buckets/cache.go +++ b/pkg/util/buckets/cache.go @@ -45,12 +45,18 @@ func (mon *monitor[E]) UpsertDoc(doc E) { } // fixDoc ensures that there is a monitoring goroutine for the given document -// iff it is in a bucket owned by us. Caller must hold mon.mu.Lock. +// if it is in a bucket owned by us. Caller must hold mon.mu.Lock. 
func (mon *monitor[E]) FixDoc(doc E) { id := strings.ToLower(doc.GetID()) v := mon.docs[id] - _, ours := mon.buckets[v.doc.GetBucket()] + var ours bool + // getBucket() with -1 is served by all + if v.doc.GetBucket() > -1 { + _, ours = mon.buckets[v.doc.GetBucket()] + } else { + ours = true + } if !ours && v.stop != nil { mon.baseLog.Debugf("we no longer own cluster, closing worker for %s", doc.GetID()) From 885ae0bcbacaa98cea2edc1894dc9cd3d3030f4d Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 16:53:30 +1100 Subject: [PATCH 09/20] use these dbs in the actuator/scheduler cmd --- cmd/aro/mimoactuator.go | 13 ++++++++----- cmd/aro/mimoscheduler.go | 14 ++++++++------ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cmd/aro/mimoactuator.go b/cmd/aro/mimoactuator.go index 489022a22ae..46e62cdadcb 100644 --- a/cmd/aro/mimoactuator.go +++ b/cmd/aro/mimoactuator.go @@ -77,9 +77,15 @@ func mimoActuator(ctx context.Context, _log *logrus.Entry) error { return err } + poolWorkers, err := database.NewPoolWorkers(ctx, dbc, dbName) + if err != nil { + return err + } + dbg := database.NewDBGroup(). WithOpenShiftClusters(clusters). - WithMaintenanceManifests(manifests) + WithMaintenanceManifests(manifests). 
+ WithPoolWorkers(poolWorkers) go database.EmitMIMOMetrics(ctx, _env.LoggerForComponent("metrics"), manifests, m) @@ -88,10 +94,7 @@ func mimoActuator(ctx context.Context, _log *logrus.Entry) error { return err } - buckets := actuator.DetermineBuckets(_env, os.Hostname) - log.Printf("serving %d buckets: %v", len(buckets), buckets) - - a := actuator.NewService(_env, log, dialer, dbg, m, buckets) + a := actuator.NewService(_env, log, dialer, dbg, m) a.SetMaintenanceTasks(tasks.DEFAULT_MAINTENANCE_TASKS) sigterm := make(chan os.Signal, 1) diff --git a/cmd/aro/mimoscheduler.go b/cmd/aro/mimoscheduler.go index 2680e8c7e91..ab8daa3799c 100644 --- a/cmd/aro/mimoscheduler.go +++ b/cmd/aro/mimoscheduler.go @@ -15,7 +15,6 @@ import ( "github.com/Azure/ARO-RP/pkg/env" "github.com/Azure/ARO-RP/pkg/metrics/statsd" "github.com/Azure/ARO-RP/pkg/metrics/statsd/golang" - "github.com/Azure/ARO-RP/pkg/mimo/actuator" "github.com/Azure/ARO-RP/pkg/mimo/scheduler" "github.com/Azure/ARO-RP/pkg/mimo/tasks" "github.com/Azure/ARO-RP/pkg/util/encryption" @@ -87,16 +86,19 @@ func mimoScheduler(ctx context.Context, _log *logrus.Entry) error { return err } + poolWorkers, err := database.NewPoolWorkers(ctx, dbc, dbName) + if err != nil { + return err + } + dbg := database.NewDBGroup(). WithOpenShiftClusters(clusters). WithSubscriptions(subscriptions). WithMaintenanceManifests(manifests). - WithMaintenanceSchedules(schedules) - - buckets := actuator.DetermineBuckets(_env, os.Hostname) - log.Printf("serving %d buckets: %v", len(buckets), buckets) + WithMaintenanceSchedules(schedules). 
+ WithPoolWorkers(poolWorkers) - a := scheduler.NewService(_env, log, dbg, m, buckets) + a := scheduler.NewService(_env, log, dbg, m) a.SetMaintenanceTasks(tasks.DEFAULT_MAINTENANCE_TASKS) sigterm := make(chan os.Signal, 1) From 6582ee30336532b25399ade436218f2d1b70a060 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 16:53:40 +1100 Subject: [PATCH 10/20] unneeded code from the monitor --- pkg/monitor/monitor.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/monitor/monitor.go b/pkg/monitor/monitor.go index db5962fb423..124117b4cd3 100644 --- a/pkg/monitor/monitor.go +++ b/pkg/monitor/monitor.go @@ -61,7 +61,6 @@ type monitor struct { subs changefeed.SubscriptionsCache env env.Interface - isMaster bool bucketCount int buckets map[int]struct{} From 637aea39c32baf55f45efec71966ceeba6e93fe0 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 17:03:32 +1100 Subject: [PATCH 11/20] remove old monitor code --- pkg/api/monitor.go | 11 - pkg/api/monitordocument.go | 37 -- pkg/database/cosmosdb/generate.go | 2 +- .../cosmosdb/zz_generated_monitordocument.go | 313 -------------- .../zz_generated_monitordocument_fake.go | 389 ------------------ pkg/database/database.go | 1 - pkg/database/dbgroup.go | 19 - pkg/database/monitors.go | 169 -------- pkg/deploy/generator/resources_rp.go | 26 -- test/database/inmemory.go | 11 - test/database/monitors.go | 76 ---- 11 files changed, 1 insertion(+), 1053 deletions(-) delete mode 100644 pkg/api/monitor.go delete mode 100644 pkg/api/monitordocument.go delete mode 100644 pkg/database/cosmosdb/zz_generated_monitordocument.go delete mode 100644 pkg/database/cosmosdb/zz_generated_monitordocument_fake.go delete mode 100644 pkg/database/monitors.go delete mode 100644 test/database/monitors.go diff --git a/pkg/api/monitor.go b/pkg/api/monitor.go deleted file mode 100644 index 16d65d03a00..00000000000 --- a/pkg/api/monitor.go +++ /dev/null @@ -1,11 +0,0 @@ -package api - -// Copyright (c) Microsoft Corporation. 
-// Licensed under the Apache License 2.0. - -// Monitor represents a monitor -type Monitor struct { - MissingFields - - Buckets []string `json:"buckets,omitempty"` -} diff --git a/pkg/api/monitordocument.go b/pkg/api/monitordocument.go deleted file mode 100644 index cae544aadc1..00000000000 --- a/pkg/api/monitordocument.go +++ /dev/null @@ -1,37 +0,0 @@ -package api - -// Copyright (c) Microsoft Corporation. -// Licensed under the Apache License 2.0. - -// MonitorDocuments represents monitor documents. -// pkg/database/cosmosdb requires its definition. -type MonitorDocuments struct { - Count int `json:"_count,omitempty"` - ResourceID string `json:"_rid,omitempty"` - MonitorDocuments []*MonitorDocument `json:"Documents,omitempty"` -} - -// MonitorDocument represents a monitor document. -// pkg/database/cosmosdb requires its definition. -type MonitorDocument struct { - MissingFields - - ID string `json:"id,omitempty"` - ResourceID string `json:"_rid,omitempty"` - Timestamp int `json:"_ts,omitempty"` - Self string `json:"_self,omitempty"` - ETag string `json:"_etag,omitempty" deep:"-"` - Attachments string `json:"_attachments,omitempty"` - TTL int `json:"ttl,omitempty"` - LSN int `json:"_lsn,omitempty"` - Metadata map[string]interface{} `json:"_metadata,omitempty"` - - LeaseOwner string `json:"leaseOwner,omitempty"` - LeaseExpires int `json:"leaseExpires,omitempty"` - - Monitor *Monitor `json:"monitor,omitempty"` -} - -func (c *MonitorDocument) GetID() string { - return c.ID -} diff --git a/pkg/database/cosmosdb/generate.go b/pkg/database/cosmosdb/generate.go index ae95676e63b..83a2eb4724c 100644 --- a/pkg/database/cosmosdb/generate.go +++ b/pkg/database/cosmosdb/generate.go @@ -3,5 +3,5 @@ package cosmosdb // Copyright (c) Microsoft Corporation. // Licensed under the Apache License 2.0. 
-//go:generate gencosmosdb github.com/Azure/ARO-RP/pkg/api,AsyncOperationDocument github.com/Azure/ARO-RP/pkg/api,BillingDocument github.com/Azure/ARO-RP/pkg/api,GatewayDocument github.com/Azure/ARO-RP/pkg/api,MonitorDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftClusterDocument github.com/Azure/ARO-RP/pkg/api,SubscriptionDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftVersionDocument github.com/Azure/ARO-RP/pkg/api,PlatformWorkloadIdentityRoleSetDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceManifestDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceScheduleDocument github.com/Azure/ARO-RP/pkg/api,PoolWorkerDocument +//go:generate gencosmosdb github.com/Azure/ARO-RP/pkg/api,AsyncOperationDocument github.com/Azure/ARO-RP/pkg/api,BillingDocument github.com/Azure/ARO-RP/pkg/api,GatewayDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftClusterDocument github.com/Azure/ARO-RP/pkg/api,SubscriptionDocument github.com/Azure/ARO-RP/pkg/api,OpenShiftVersionDocument github.com/Azure/ARO-RP/pkg/api,PlatformWorkloadIdentityRoleSetDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceManifestDocument github.com/Azure/ARO-RP/pkg/api,MaintenanceScheduleDocument github.com/Azure/ARO-RP/pkg/api,PoolWorkerDocument //go:generate mockgen -destination=../../util/mocks/$GOPACKAGE/$GOPACKAGE.go github.com/Azure/ARO-RP/pkg/database/$GOPACKAGE PermissionClient diff --git a/pkg/database/cosmosdb/zz_generated_monitordocument.go b/pkg/database/cosmosdb/zz_generated_monitordocument.go deleted file mode 100644 index 07176ffab63..00000000000 --- a/pkg/database/cosmosdb/zz_generated_monitordocument.go +++ /dev/null @@ -1,313 +0,0 @@ -// Code generated by github.com/bennerv/go-cosmosdb, DO NOT EDIT. 
- -package cosmosdb - -import ( - "context" - "net/http" - "strconv" - "strings" - - pkg "github.com/Azure/ARO-RP/pkg/api" -) - -type monitorDocumentClient struct { - *databaseClient - path string -} - -// MonitorDocumentClient is a monitorDocument client -type MonitorDocumentClient interface { - Create(context.Context, string, *pkg.MonitorDocument, *Options) (*pkg.MonitorDocument, error) - List(*Options) MonitorDocumentIterator - ListAll(context.Context, *Options) (*pkg.MonitorDocuments, error) - Get(context.Context, string, string, *Options) (*pkg.MonitorDocument, error) - Replace(context.Context, string, *pkg.MonitorDocument, *Options) (*pkg.MonitorDocument, error) - Delete(context.Context, string, *pkg.MonitorDocument, *Options) error - Query(string, *Query, *Options) MonitorDocumentRawIterator - QueryAll(context.Context, string, *Query, *Options) (*pkg.MonitorDocuments, error) - ChangeFeed(*Options) MonitorDocumentIterator -} - -type monitorDocumentChangeFeedIterator struct { - *monitorDocumentClient - continuation string - options *Options -} - -type monitorDocumentListIterator struct { - *monitorDocumentClient - continuation string - done bool - options *Options -} - -type monitorDocumentQueryIterator struct { - *monitorDocumentClient - partitionkey string - query *Query - continuation string - done bool - options *Options -} - -// MonitorDocumentIterator is a monitorDocument iterator -type MonitorDocumentIterator interface { - Next(context.Context, int) (*pkg.MonitorDocuments, error) - Continuation() string -} - -// MonitorDocumentRawIterator is a monitorDocument raw iterator -type MonitorDocumentRawIterator interface { - MonitorDocumentIterator - NextRaw(context.Context, int, interface{}) error -} - -// NewMonitorDocumentClient returns a new monitorDocument client -func NewMonitorDocumentClient(collc CollectionClient, collid string) MonitorDocumentClient { - return &monitorDocumentClient{ - databaseClient: collc.(*collectionClient).databaseClient, - path: 
collc.(*collectionClient).path + "/colls/" + collid, - } -} - -func (c *monitorDocumentClient) all(ctx context.Context, i MonitorDocumentIterator) (*pkg.MonitorDocuments, error) { - allmonitorDocuments := &pkg.MonitorDocuments{} - - for { - monitorDocuments, err := i.Next(ctx, -1) - if err != nil { - return nil, err - } - if monitorDocuments == nil { - break - } - - allmonitorDocuments.Count += monitorDocuments.Count - allmonitorDocuments.ResourceID = monitorDocuments.ResourceID - allmonitorDocuments.MonitorDocuments = append(allmonitorDocuments.MonitorDocuments, monitorDocuments.MonitorDocuments...) - } - - return allmonitorDocuments, nil -} - -func (c *monitorDocumentClient) Create(ctx context.Context, partitionkey string, newmonitorDocument *pkg.MonitorDocument, options *Options) (monitorDocument *pkg.MonitorDocument, err error) { - headers := http.Header{} - headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) - - if options == nil { - options = &Options{} - } - options.NoETag = true - - err = c.setOptions(options, newmonitorDocument, headers) - if err != nil { - return - } - - err = c.do(ctx, http.MethodPost, c.path+"/docs", "docs", c.path, http.StatusCreated, &newmonitorDocument, &monitorDocument, headers) - return -} - -func (c *monitorDocumentClient) List(options *Options) MonitorDocumentIterator { - continuation := "" - if options != nil { - continuation = options.Continuation - } - - return &monitorDocumentListIterator{monitorDocumentClient: c, options: options, continuation: continuation} -} - -func (c *monitorDocumentClient) ListAll(ctx context.Context, options *Options) (*pkg.MonitorDocuments, error) { - return c.all(ctx, c.List(options)) -} - -func (c *monitorDocumentClient) Get(ctx context.Context, partitionkey, monitorDocumentid string, options *Options) (monitorDocument *pkg.MonitorDocument, err error) { - headers := http.Header{} - headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) - - err = c.setOptions(options, 
nil, headers) - if err != nil { - return - } - - err = c.do(ctx, http.MethodGet, c.path+"/docs/"+monitorDocumentid, "docs", c.path+"/docs/"+monitorDocumentid, http.StatusOK, nil, &monitorDocument, headers) - return -} - -func (c *monitorDocumentClient) Replace(ctx context.Context, partitionkey string, newmonitorDocument *pkg.MonitorDocument, options *Options) (monitorDocument *pkg.MonitorDocument, err error) { - headers := http.Header{} - headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) - - err = c.setOptions(options, newmonitorDocument, headers) - if err != nil { - return - } - - err = c.do(ctx, http.MethodPut, c.path+"/docs/"+newmonitorDocument.ID, "docs", c.path+"/docs/"+newmonitorDocument.ID, http.StatusOK, &newmonitorDocument, &monitorDocument, headers) - return -} - -func (c *monitorDocumentClient) Delete(ctx context.Context, partitionkey string, monitorDocument *pkg.MonitorDocument, options *Options) (err error) { - headers := http.Header{} - headers.Set("X-Ms-Documentdb-Partitionkey", `["`+partitionkey+`"]`) - - err = c.setOptions(options, monitorDocument, headers) - if err != nil { - return - } - - err = c.do(ctx, http.MethodDelete, c.path+"/docs/"+monitorDocument.ID, "docs", c.path+"/docs/"+monitorDocument.ID, http.StatusNoContent, nil, nil, headers) - return -} - -func (c *monitorDocumentClient) Query(partitionkey string, query *Query, options *Options) MonitorDocumentRawIterator { - continuation := "" - if options != nil { - continuation = options.Continuation - } - - return &monitorDocumentQueryIterator{monitorDocumentClient: c, partitionkey: partitionkey, query: query, options: options, continuation: continuation} -} - -func (c *monitorDocumentClient) QueryAll(ctx context.Context, partitionkey string, query *Query, options *Options) (*pkg.MonitorDocuments, error) { - return c.all(ctx, c.Query(partitionkey, query, options)) -} - -func (c *monitorDocumentClient) ChangeFeed(options *Options) MonitorDocumentIterator { - continuation := 
"" - if options != nil { - continuation = options.Continuation - } - - return &monitorDocumentChangeFeedIterator{monitorDocumentClient: c, options: options, continuation: continuation} -} - -func (c *monitorDocumentClient) setOptions(options *Options, monitorDocument *pkg.MonitorDocument, headers http.Header) error { - if options == nil { - return nil - } - - if monitorDocument != nil && !options.NoETag { - if monitorDocument.ETag == "" { - return ErrETagRequired - } - headers.Set("If-Match", monitorDocument.ETag) - } - if len(options.PreTriggers) > 0 { - headers.Set("X-Ms-Documentdb-Pre-Trigger-Include", strings.Join(options.PreTriggers, ",")) - } - if len(options.PostTriggers) > 0 { - headers.Set("X-Ms-Documentdb-Post-Trigger-Include", strings.Join(options.PostTriggers, ",")) - } - if len(options.PartitionKeyRangeID) > 0 { - headers.Set("X-Ms-Documentdb-PartitionKeyRangeID", options.PartitionKeyRangeID) - } - - return nil -} - -func (i *monitorDocumentChangeFeedIterator) Next(ctx context.Context, maxItemCount int) (monitorDocuments *pkg.MonitorDocuments, err error) { - headers := http.Header{} - headers.Set("A-IM", "Incremental feed") - - headers.Set("X-Ms-Max-Item-Count", strconv.Itoa(maxItemCount)) - if i.continuation != "" { - headers.Set("If-None-Match", i.continuation) - } - - err = i.setOptions(i.options, nil, headers) - if err != nil { - return - } - - err = i.do(ctx, http.MethodGet, i.path+"/docs", "docs", i.path, http.StatusOK, nil, &monitorDocuments, headers) - if IsErrorStatusCode(err, http.StatusNotModified) { - err = nil - } - if err != nil { - return - } - - i.continuation = headers.Get("Etag") - - return -} - -func (i *monitorDocumentChangeFeedIterator) Continuation() string { - return i.continuation -} - -func (i *monitorDocumentListIterator) Next(ctx context.Context, maxItemCount int) (monitorDocuments *pkg.MonitorDocuments, err error) { - if i.done { - return - } - - headers := http.Header{} - headers.Set("X-Ms-Max-Item-Count", 
strconv.Itoa(maxItemCount)) - if i.continuation != "" { - headers.Set("X-Ms-Continuation", i.continuation) - } - - err = i.setOptions(i.options, nil, headers) - if err != nil { - return - } - - err = i.do(ctx, http.MethodGet, i.path+"/docs", "docs", i.path, http.StatusOK, nil, &monitorDocuments, headers) - if err != nil { - return - } - - i.continuation = headers.Get("X-Ms-Continuation") - i.done = i.continuation == "" - - return -} - -func (i *monitorDocumentListIterator) Continuation() string { - return i.continuation -} - -func (i *monitorDocumentQueryIterator) Next(ctx context.Context, maxItemCount int) (monitorDocuments *pkg.MonitorDocuments, err error) { - err = i.NextRaw(ctx, maxItemCount, &monitorDocuments) - return -} - -func (i *monitorDocumentQueryIterator) NextRaw(ctx context.Context, maxItemCount int, raw interface{}) (err error) { - if i.done { - return - } - - headers := http.Header{} - headers.Set("X-Ms-Max-Item-Count", strconv.Itoa(maxItemCount)) - headers.Set("X-Ms-Documentdb-Isquery", "True") - headers.Set("Content-Type", "application/query+json") - if i.partitionkey != "" { - headers.Set("X-Ms-Documentdb-Partitionkey", `["`+i.partitionkey+`"]`) - } else { - headers.Set("X-Ms-Documentdb-Query-Enablecrosspartition", "True") - } - if i.continuation != "" { - headers.Set("X-Ms-Continuation", i.continuation) - } - - err = i.setOptions(i.options, nil, headers) - if err != nil { - return - } - - err = i.do(ctx, http.MethodPost, i.path+"/docs", "docs", i.path, http.StatusOK, &i.query, &raw, headers) - if err != nil { - return - } - - i.continuation = headers.Get("X-Ms-Continuation") - i.done = i.continuation == "" - - return -} - -func (i *monitorDocumentQueryIterator) Continuation() string { - return i.continuation -} diff --git a/pkg/database/cosmosdb/zz_generated_monitordocument_fake.go b/pkg/database/cosmosdb/zz_generated_monitordocument_fake.go deleted file mode 100644 index 07691b4f841..00000000000 --- 
a/pkg/database/cosmosdb/zz_generated_monitordocument_fake.go +++ /dev/null @@ -1,389 +0,0 @@ -// Code generated by github.com/bennerv/go-cosmosdb, DO NOT EDIT. - -package cosmosdb - -import ( - "context" - "fmt" - "net/http" - "sync" - - "github.com/ugorji/go/codec" - - pkg "github.com/Azure/ARO-RP/pkg/api" -) - -type ( - fakeMonitorDocumentTriggerHandler func(context.Context, *pkg.MonitorDocument) error - fakeMonitorDocumentQueryHandler func(MonitorDocumentClient, *Query, *Options) MonitorDocumentRawIterator -) - -var _ MonitorDocumentClient = &FakeMonitorDocumentClient{} - -// NewFakeMonitorDocumentClient returns a FakeMonitorDocumentClient -func NewFakeMonitorDocumentClient(h *codec.JsonHandle) *FakeMonitorDocumentClient { - return &FakeMonitorDocumentClient{ - jsonHandle: h, - monitorDocuments: make(map[string]*pkg.MonitorDocument), - triggerHandlers: make(map[string]fakeMonitorDocumentTriggerHandler), - queryHandlers: make(map[string]fakeMonitorDocumentQueryHandler), - } -} - -// FakeMonitorDocumentClient is a FakeMonitorDocumentClient -type FakeMonitorDocumentClient struct { - lock sync.RWMutex - jsonHandle *codec.JsonHandle - monitorDocuments map[string]*pkg.MonitorDocument - triggerHandlers map[string]fakeMonitorDocumentTriggerHandler - queryHandlers map[string]fakeMonitorDocumentQueryHandler - sorter func([]*pkg.MonitorDocument) - etag int - changeFeedIterators []*fakeMonitorDocumentIterator - - // returns true if documents conflict - conflictChecker func(*pkg.MonitorDocument, *pkg.MonitorDocument) bool - - // err, if not nil, is an error to return when attempting to communicate - // with this Client - err error -} - -// SetError sets or unsets an error that will be returned on any -// FakeMonitorDocumentClient method invocation -func (c *FakeMonitorDocumentClient) SetError(err error) { - c.lock.Lock() - defer c.lock.Unlock() - - c.err = err -} - -// SetSorter sets or unsets a sorter function which will be used to sort values -// returned by List() for 
test stability -func (c *FakeMonitorDocumentClient) SetSorter(sorter func([]*pkg.MonitorDocument)) { - c.lock.Lock() - defer c.lock.Unlock() - - c.sorter = sorter -} - -// SetConflictChecker sets or unsets a function which can be used to validate -// additional unique keys in a MonitorDocument -func (c *FakeMonitorDocumentClient) SetConflictChecker(conflictChecker func(*pkg.MonitorDocument, *pkg.MonitorDocument) bool) { - c.lock.Lock() - defer c.lock.Unlock() - - c.conflictChecker = conflictChecker -} - -// SetTriggerHandler sets or unsets a trigger handler -func (c *FakeMonitorDocumentClient) SetTriggerHandler(triggerName string, trigger fakeMonitorDocumentTriggerHandler) { - c.lock.Lock() - defer c.lock.Unlock() - - c.triggerHandlers[triggerName] = trigger -} - -// SetQueryHandler sets or unsets a query handler -func (c *FakeMonitorDocumentClient) SetQueryHandler(queryName string, query fakeMonitorDocumentQueryHandler) { - c.lock.Lock() - defer c.lock.Unlock() - - c.queryHandlers[queryName] = query -} - -func (c *FakeMonitorDocumentClient) deepCopy(monitorDocument *pkg.MonitorDocument) (*pkg.MonitorDocument, error) { - var b []byte - err := codec.NewEncoderBytes(&b, c.jsonHandle).Encode(monitorDocument) - if err != nil { - return nil, err - } - - monitorDocument = nil - err = codec.NewDecoderBytes(b, c.jsonHandle).Decode(&monitorDocument) - if err != nil { - return nil, err - } - - return monitorDocument, nil -} - -func (c *FakeMonitorDocumentClient) apply(ctx context.Context, partitionkey string, monitorDocument *pkg.MonitorDocument, options *Options, isCreate bool) (*pkg.MonitorDocument, error) { - c.lock.Lock() - defer c.lock.Unlock() - - if c.err != nil { - return nil, c.err - } - - monitorDocument, err := c.deepCopy(monitorDocument) // copy now because pretriggers can mutate monitorDocument - if err != nil { - return nil, err - } - - if options != nil { - err := c.processPreTriggers(ctx, monitorDocument, options) - if err != nil { - return nil, err - } - } - 
- existingMonitorDocument, exists := c.monitorDocuments[monitorDocument.ID] - if isCreate && exists { - return nil, &Error{ - StatusCode: http.StatusConflict, - Message: "Entity with the specified id already exists in the system", - } - } - if !isCreate { - if !exists { - return nil, &Error{StatusCode: http.StatusNotFound} - } - - if (options == nil || !options.NoETag) && monitorDocument.ETag != existingMonitorDocument.ETag { - return nil, &Error{StatusCode: http.StatusPreconditionFailed} - } - } - - if c.conflictChecker != nil { - for _, monitorDocumentToCheck := range c.monitorDocuments { - if c.conflictChecker(monitorDocumentToCheck, monitorDocument) { - return nil, &Error{ - StatusCode: http.StatusConflict, - Message: "Entity with the specified id already exists in the system", - } - } - } - } - - monitorDocument.ETag = fmt.Sprint(c.etag) - c.etag++ - - c.monitorDocuments[monitorDocument.ID] = monitorDocument - - if err = c.updateChangeFeeds(monitorDocument); err != nil { - return nil, err - } - - return c.deepCopy(monitorDocument) -} - -// Create creates a MonitorDocument in the database -func (c *FakeMonitorDocumentClient) Create(ctx context.Context, partitionkey string, monitorDocument *pkg.MonitorDocument, options *Options) (*pkg.MonitorDocument, error) { - return c.apply(ctx, partitionkey, monitorDocument, options, true) -} - -// Replace replaces a MonitorDocument in the database -func (c *FakeMonitorDocumentClient) Replace(ctx context.Context, partitionkey string, monitorDocument *pkg.MonitorDocument, options *Options) (*pkg.MonitorDocument, error) { - return c.apply(ctx, partitionkey, monitorDocument, options, false) -} - -// List returns a MonitorDocumentIterator to list all MonitorDocuments in the database -func (c *FakeMonitorDocumentClient) List(*Options) MonitorDocumentIterator { - c.lock.RLock() - defer c.lock.RUnlock() - - if c.err != nil { - return NewFakeMonitorDocumentErroringRawIterator(c.err) - } - - monitorDocuments := 
make([]*pkg.MonitorDocument, 0, len(c.monitorDocuments)) - for _, monitorDocument := range c.monitorDocuments { - monitorDocument, err := c.deepCopy(monitorDocument) - if err != nil { - return NewFakeMonitorDocumentErroringRawIterator(err) - } - monitorDocuments = append(monitorDocuments, monitorDocument) - } - - if c.sorter != nil { - c.sorter(monitorDocuments) - } - - return NewFakeMonitorDocumentIterator(monitorDocuments, 0) -} - -// ListAll lists all MonitorDocuments in the database -func (c *FakeMonitorDocumentClient) ListAll(ctx context.Context, options *Options) (*pkg.MonitorDocuments, error) { - iter := c.List(options) - return iter.Next(ctx, -1) -} - -// Get gets a MonitorDocument from the database -func (c *FakeMonitorDocumentClient) Get(ctx context.Context, partitionkey string, id string, options *Options) (*pkg.MonitorDocument, error) { - c.lock.RLock() - defer c.lock.RUnlock() - - if c.err != nil { - return nil, c.err - } - - monitorDocument, exists := c.monitorDocuments[id] - if !exists { - return nil, &Error{StatusCode: http.StatusNotFound} - } - - return c.deepCopy(monitorDocument) -} - -// Delete deletes a MonitorDocument from the database -func (c *FakeMonitorDocumentClient) Delete(ctx context.Context, partitionKey string, monitorDocument *pkg.MonitorDocument, options *Options) error { - c.lock.Lock() - defer c.lock.Unlock() - - if c.err != nil { - return c.err - } - - _, exists := c.monitorDocuments[monitorDocument.ID] - if !exists { - return &Error{StatusCode: http.StatusNotFound} - } - - delete(c.monitorDocuments, monitorDocument.ID) - return nil -} - -// ChangeFeed is a basic implementation of cosmosDB Changefeeds. 
Compared to the real changefeeds, its implementation is much more simplistic: -// - Deleting a MonitorDocument does not remove it from the existing change feeds -// - when a MonitorDocument is pushed into the changefeed, older versions that have not been retrieved won't be removed, meaning there's no guarantee that a monitorDocument from the changefeed is actually the most recent version. -func (c *FakeMonitorDocumentClient) ChangeFeed(*Options) MonitorDocumentIterator { - c.lock.RLock() - defer c.lock.RUnlock() - - if c.err != nil { - return NewFakeMonitorDocumentErroringRawIterator(c.err) - } - - newIter, ok := c.List(nil).(*fakeMonitorDocumentIterator) - if !ok { - return NewFakeMonitorDocumentErroringRawIterator(fmt.Errorf("internal error")) - } - - c.changeFeedIterators = append(c.changeFeedIterators, newIter) - return newIter -} - -func (c *FakeMonitorDocumentClient) updateChangeFeeds(monitorDocument *pkg.MonitorDocument) error { - for _, currentIterator := range c.changeFeedIterators { - newTpl, err := c.deepCopy(monitorDocument) - if err != nil { - return err - } - - currentIterator.monitorDocuments = append(currentIterator.monitorDocuments, newTpl) - currentIterator.done = false - } - return nil -} - -func (c *FakeMonitorDocumentClient) processPreTriggers(ctx context.Context, monitorDocument *pkg.MonitorDocument, options *Options) error { - for _, triggerName := range options.PreTriggers { - if triggerHandler := c.triggerHandlers[triggerName]; triggerHandler != nil { - c.lock.Unlock() - err := triggerHandler(ctx, monitorDocument) - c.lock.Lock() - if err != nil { - return err - } - } else { - return ErrNotImplemented - } - } - - return nil -} - -// Query calls a query handler to implement database querying -func (c *FakeMonitorDocumentClient) Query(name string, query *Query, options *Options) MonitorDocumentRawIterator { - c.lock.RLock() - defer c.lock.RUnlock() - - if c.err != nil { - return NewFakeMonitorDocumentErroringRawIterator(c.err) - } - - if 
queryHandler := c.queryHandlers[query.Query]; queryHandler != nil { - c.lock.RUnlock() - i := queryHandler(c, query, options) - c.lock.RLock() - return i - } - - return NewFakeMonitorDocumentErroringRawIterator(ErrNotImplemented) -} - -// QueryAll calls a query handler to implement database querying -func (c *FakeMonitorDocumentClient) QueryAll(ctx context.Context, partitionkey string, query *Query, options *Options) (*pkg.MonitorDocuments, error) { - iter := c.Query("", query, options) - return iter.Next(ctx, -1) -} - -func NewFakeMonitorDocumentIterator(monitorDocuments []*pkg.MonitorDocument, continuation int) MonitorDocumentRawIterator { - return &fakeMonitorDocumentIterator{monitorDocuments: monitorDocuments, continuation: continuation} -} - -type fakeMonitorDocumentIterator struct { - monitorDocuments []*pkg.MonitorDocument - continuation int - done bool -} - -func (i *fakeMonitorDocumentIterator) NextRaw(ctx context.Context, maxItemCount int, out interface{}) error { - return ErrNotImplemented -} - -func (i *fakeMonitorDocumentIterator) Next(ctx context.Context, maxItemCount int) (*pkg.MonitorDocuments, error) { - if i.done { - return nil, nil - } - - var monitorDocuments []*pkg.MonitorDocument - if maxItemCount == -1 { - monitorDocuments = i.monitorDocuments[i.continuation:] - i.continuation = len(i.monitorDocuments) - i.done = true - } else { - max := i.continuation + maxItemCount - if max > len(i.monitorDocuments) { - max = len(i.monitorDocuments) - } - monitorDocuments = i.monitorDocuments[i.continuation:max] - i.continuation = max - i.done = i.Continuation() == "" - } - - return &pkg.MonitorDocuments{ - MonitorDocuments: monitorDocuments, - Count: len(monitorDocuments), - }, nil -} - -func (i *fakeMonitorDocumentIterator) Continuation() string { - if i.continuation >= len(i.monitorDocuments) { - return "" - } - return fmt.Sprintf("%d", i.continuation) -} - -// NewFakeMonitorDocumentErroringRawIterator returns a MonitorDocumentRawIterator which -// whose 
methods return the given error -func NewFakeMonitorDocumentErroringRawIterator(err error) MonitorDocumentRawIterator { - return &fakeMonitorDocumentErroringRawIterator{err: err} -} - -type fakeMonitorDocumentErroringRawIterator struct { - err error -} - -func (i *fakeMonitorDocumentErroringRawIterator) Next(ctx context.Context, maxItemCount int) (*pkg.MonitorDocuments, error) { - return nil, i.err -} - -func (i *fakeMonitorDocumentErroringRawIterator) NextRaw(context.Context, int, interface{}) error { - return i.err -} - -func (i *fakeMonitorDocumentErroringRawIterator) Continuation() string { - return "" -} diff --git a/pkg/database/database.go b/pkg/database/database.go index de18f0c9b35..d4d7709daaf 100644 --- a/pkg/database/database.go +++ b/pkg/database/database.go @@ -29,7 +29,6 @@ const ( collAsyncOperations = "AsyncOperations" collBilling = "Billing" collGateway = "Gateway" - collMonitors = "Monitors" collOpenShiftClusters = "OpenShiftClusters" collOpenShiftVersion = "OpenShiftVersions" collPlatformWorkloadIdentityRoleSet = "PlatformWorkloadIdentityRoleSets" diff --git a/pkg/database/dbgroup.go b/pkg/database/dbgroup.go index 9ed475635a5..4ec54fc1dec 100644 --- a/pkg/database/dbgroup.go +++ b/pkg/database/dbgroup.go @@ -13,10 +13,6 @@ type DatabaseGroupWithSubscriptions interface { Subscriptions() (Subscriptions, error) } -type DatabaseGroupWithMonitors interface { - Monitors() (Monitors, error) -} - type DatabaseGroupWithOpenShiftVersions interface { OpenShiftVersions() (OpenShiftVersions, error) } @@ -52,7 +48,6 @@ type DatabaseGroupWithPoolWorkers interface { type DatabaseGroup interface { DatabaseGroupWithOpenShiftClusters DatabaseGroupWithSubscriptions - DatabaseGroupWithMonitors DatabaseGroupWithOpenShiftVersions DatabaseGroupWithPlatformWorkloadIdentityRoleSets DatabaseGroupWithAsyncOperations @@ -64,7 +59,6 @@ type DatabaseGroup interface { WithOpenShiftClusters(db OpenShiftClusters) DatabaseGroup WithSubscriptions(db Subscriptions) DatabaseGroup - 
WithMonitors(db Monitors) DatabaseGroup WithOpenShiftVersions(db OpenShiftVersions) DatabaseGroup WithPlatformWorkloadIdentityRoleSets(db PlatformWorkloadIdentityRoleSets) DatabaseGroup WithAsyncOperations(db AsyncOperations) DatabaseGroup @@ -78,7 +72,6 @@ type DatabaseGroup interface { type dbGroup struct { openShiftClusters OpenShiftClusters subscriptions Subscriptions - monitors Monitors platformWorkloadIdentityRoleSets PlatformWorkloadIdentityRoleSets openShiftVersions OpenShiftVersions asyncOperations AsyncOperations @@ -113,18 +106,6 @@ func (d *dbGroup) WithSubscriptions(db Subscriptions) DatabaseGroup { return d } -func (d *dbGroup) Monitors() (Monitors, error) { - if d.monitors == nil { - return nil, errors.New("no Monitors database client set") - } - return d.monitors, nil -} - -func (d *dbGroup) WithMonitors(db Monitors) DatabaseGroup { - d.monitors = db - return d -} - func (d *dbGroup) OpenShiftVersions() (OpenShiftVersions, error) { if d.openShiftVersions == nil { return nil, errors.New("no OpenShiftVersions database client set") diff --git a/pkg/database/monitors.go b/pkg/database/monitors.go deleted file mode 100644 index b5372906979..00000000000 --- a/pkg/database/monitors.go +++ /dev/null @@ -1,169 +0,0 @@ -package database - -// Copyright (c) Microsoft Corporation. -// Licensed under the Apache License 2.0. - -import ( - "context" - "fmt" - "net/http" - "strings" - - "github.com/Azure/ARO-RP/pkg/api" - "github.com/Azure/ARO-RP/pkg/database/cosmosdb" - "github.com/Azure/ARO-RP/pkg/util/uuid" -) - -const ( - MonitorsTryLeaseQuery string = `SELECT * FROM Monitors doc WHERE doc.id = "master" AND (doc.leaseExpires ?? 
0) < GetCurrentTimestamp() / 1000` - MonitorsWorkerQuery string = `SELECT * FROM Monitors doc WHERE doc.id != "master"` -) - -type monitors struct { - c cosmosdb.MonitorDocumentClient - uuid string -} - -// Monitors is the database interface for MonitorDocuments -type Monitors interface { - Create(context.Context, *api.MonitorDocument) (*api.MonitorDocument, error) - PatchWithLease(context.Context, string, func(*api.MonitorDocument) error) (*api.MonitorDocument, error) - TryLease(context.Context) (*api.MonitorDocument, error) - ListBuckets(context.Context) ([]int, error) - ListMonitors(context.Context) (*api.MonitorDocuments, error) - MonitorHeartbeat(context.Context, int) error -} - -// NewMonitors returns a new Monitors -func NewMonitors(ctx context.Context, dbc cosmosdb.DatabaseClient, dbName string) (Monitors, error) { - collc := cosmosdb.NewCollectionClient(dbc, dbName) - - return &monitors{ - c: cosmosdb.NewMonitorDocumentClient(collc, collMonitors), - uuid: uuid.DefaultGenerator.Generate(), - }, nil -} - -func NewMonitorsWithProvidedClient(client cosmosdb.MonitorDocumentClient, uuid string) Monitors { - return &monitors{ - c: client, - uuid: uuid, - } -} - -func (c *monitors) Create(ctx context.Context, doc *api.MonitorDocument) (*api.MonitorDocument, error) { - if doc.ID != strings.ToLower(doc.ID) { - return nil, fmt.Errorf("id %q is not lower case", doc.ID) - } - - doc, err := c.c.Create(ctx, doc.ID, doc, nil) - - if err, ok := err.(*cosmosdb.Error); ok && err.StatusCode == http.StatusConflict { - err.StatusCode = http.StatusPreconditionFailed - } - - return doc, err -} - -func (c *monitors) get(ctx context.Context, id string) (*api.MonitorDocument, error) { - if id != strings.ToLower(id) { - return nil, fmt.Errorf("id %q is not lower case", id) - } - - return c.c.Get(ctx, id, id, nil) -} - -func (c *monitors) patch(ctx context.Context, id string, f func(*api.MonitorDocument) error, options *cosmosdb.Options) (*api.MonitorDocument, error) { - var doc 
*api.MonitorDocument - - err := cosmosdb.RetryOnPreconditionFailed(func() (err error) { - doc, err = c.get(ctx, id) - if err != nil { - return - } - - err = f(doc) - if err != nil { - return - } - - doc, err = c.update(ctx, doc, options) - return - }) - - return doc, err -} - -func (c *monitors) PatchWithLease(ctx context.Context, id string, f func(*api.MonitorDocument) error) (*api.MonitorDocument, error) { - return c.patch(ctx, id, func(doc *api.MonitorDocument) error { - if doc.LeaseOwner != c.uuid { - return fmt.Errorf("lost lease") - } - - return f(doc) - }, &cosmosdb.Options{PreTriggers: []string{"renewLease"}}) -} - -func (c *monitors) update(ctx context.Context, doc *api.MonitorDocument, options *cosmosdb.Options) (*api.MonitorDocument, error) { - if doc.ID != strings.ToLower(doc.ID) { - return nil, fmt.Errorf("id %q is not lower case", doc.ID) - } - - return c.c.Replace(ctx, doc.ID, doc, options) -} - -func (c *monitors) TryLease(ctx context.Context) (*api.MonitorDocument, error) { - docs, err := c.c.QueryAll(ctx, "", &cosmosdb.Query{ - Query: MonitorsTryLeaseQuery, - }, nil) - if err != nil { - return nil, err - } - if docs == nil { - return nil, nil - } - - for _, doc := range docs.MonitorDocuments { - doc.LeaseOwner = c.uuid - doc, err = c.update(ctx, doc, &cosmosdb.Options{PreTriggers: []string{"renewLease"}}) - if cosmosdb.IsErrorStatusCode(err, http.StatusPreconditionFailed) { // someone else got there first - continue - } - return doc, err - } - - return nil, nil -} - -func (c *monitors) ListBuckets(ctx context.Context) (buckets []int, err error) { - doc, err := c.get(ctx, "master") - if err != nil || doc == nil || doc.Monitor == nil { - return nil, err - } - - for i, monitor := range doc.Monitor.Buckets { - if monitor == c.uuid { - buckets = append(buckets, i) - } - } - - return buckets, nil -} - -func (c *monitors) ListMonitors(ctx context.Context) (*api.MonitorDocuments, error) { - return c.c.QueryAll(ctx, "", &cosmosdb.Query{ - Query: 
MonitorsWorkerQuery, - }, nil) -} - -func (c *monitors) MonitorHeartbeat(ctx context.Context, ttl int) error { - doc := &api.MonitorDocument{ - ID: c.uuid, - TTL: ttl, - } - _, err := c.update(ctx, doc, &cosmosdb.Options{NoETag: true}) - if err != nil && cosmosdb.IsErrorStatusCode(err, http.StatusNotFound) { - _, err = c.Create(ctx, doc) - } - return err -} diff --git a/pkg/deploy/generator/resources_rp.go b/pkg/deploy/generator/resources_rp.go index 7f7380c4085..9ba37de394d 100644 --- a/pkg/deploy/generator/resources_rp.go +++ b/pkg/deploy/generator/resources_rp.go @@ -982,30 +982,6 @@ func (g *generator) database(databaseName string, addDependsOn bool) []*arm.Reso }, }, gateway, - { - Resource: &sdkcosmos.SQLContainerCreateUpdateParameters{ - Properties: &sdkcosmos.SQLContainerCreateUpdateProperties{ - Resource: &sdkcosmos.SQLContainerResource{ - ID: pointerutils.ToPtr("Monitors"), - PartitionKey: &sdkcosmos.ContainerPartitionKey{ - Paths: []*string{ - pointerutils.ToPtr("/id"), - }, - Kind: &hashPartitionKey, - }, - DefaultTTL: pointerutils.ToPtr(int32(-1)), - }, - Options: &sdkcosmos.CreateUpdateOptions{}, - }, - Name: pointerutils.ToPtr("[concat(parameters('databaseAccountName'), '/', " + databaseName + ", '/Monitors')]"), - Type: pointerutils.ToPtr("Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers"), - Location: pointerutils.ToPtr("[resourceGroup().location]"), - }, - APIVersion: azureclient.APIVersion("Microsoft.DocumentDB"), - DependsOn: []string{ - "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), " + databaseName + ")]", - }, - }, { Resource: &sdkcosmos.SQLContainerCreateUpdateParameters{ Properties: &sdkcosmos.SQLContainerCreateUpdateProperties{ @@ -1110,8 +1086,6 @@ func (g *generator) database(databaseName string, addDependsOn bool) []*arm.Reso g.rpCosmosDBTriggers(databaseName, "Billing", "setDeletionBillingTimeStamp", setDeletionBillingTimeStampTriggerFunction, sdkcosmos.TriggerTypePre, 
sdkcosmos.TriggerOperationReplace), // OpenShiftClusters g.rpCosmosDBTriggers(databaseName, "OpenShiftClusters", "renewLease", renewLeaseTriggerFunction, sdkcosmos.TriggerTypePre, sdkcosmos.TriggerOperationAll), - // Monitors - g.rpCosmosDBTriggers(databaseName, "Monitors", "renewLease", renewLeaseTriggerFunction, sdkcosmos.TriggerTypePre, sdkcosmos.TriggerOperationAll), // PoolWorkers g.rpCosmosDBTriggers(databaseName, "PoolWorkers", "renewLease", renewLeaseTriggerFunction, sdkcosmos.TriggerTypePre, sdkcosmos.TriggerOperationAll), // MIMO DB triggers diff --git a/test/database/inmemory.go b/test/database/inmemory.go index 262e86cf166..7c76d40d8ee 100644 --- a/test/database/inmemory.go +++ b/test/database/inmemory.go @@ -40,17 +40,6 @@ func NewFakeSubscriptions() (db database.Subscriptions, client *cosmosdb.FakeSub return db, client } -func NewFakeMonitors(now func() time.Time) (db database.Monitors, client *cosmosdb.FakeMonitorDocumentClient) { - client = cosmosdb.NewFakeMonitorDocumentClient(jsonHandle) - injectMonitors(client, now) - db = database.NewMonitorsWithProvidedClient(client, uuid.DefaultGenerator.Generate()) - return db, client -} - -func NewFakeMonitorWithExistingClient(client *cosmosdb.FakeMonitorDocumentClient) database.Monitors { - return database.NewMonitorsWithProvidedClient(client, uuid.DefaultGenerator.Generate()) -} - func NewFakePoolWorkers(now func() time.Time) (db database.PoolWorkers, client *cosmosdb.FakePoolWorkerDocumentClient) { client = cosmosdb.NewFakePoolWorkerDocumentClient(jsonHandle) injectPoolWorkers(client, now) diff --git a/test/database/monitors.go b/test/database/monitors.go deleted file mode 100644 index 119379d87c8..00000000000 --- a/test/database/monitors.go +++ /dev/null @@ -1,76 +0,0 @@ -package database - -// Copyright (c) Microsoft Corporation. -// Licensed under the Apache License 2.0. 
- -import ( - "context" - "slices" - "time" - - "github.com/Azure/ARO-RP/pkg/api" - "github.com/Azure/ARO-RP/pkg/database" - "github.com/Azure/ARO-RP/pkg/database/cosmosdb" -) - -func fakeMonitoringRenewLeaseTrigger(_ context.Context, doc *api.MonitorDocument, now func() time.Time) error { - doc.LeaseExpires = int(now().Unix()) + 60 - return nil -} - -func fakeMonitorGetMasterQuery(client cosmosdb.MonitorDocumentClient, _ *cosmosdb.Query, opts *cosmosdb.Options, now func() time.Time) cosmosdb.MonitorDocumentRawIterator { - input, err := client.ListAll(context.Background(), opts) - if err != nil { - // TODO: should this never happen? - panic(err) - } - - out := []*api.MonitorDocument{} - for _, r := range input.MonitorDocuments { - if r.ID != "master" { - continue - } - if time.Unix(int64(r.LeaseExpires), 0).After(now()) { - continue - } - out = append(out, r) - } - - return cosmosdb.NewFakeMonitorDocumentIterator(out, 0) -} - -func fakeMonitorGetAllButMasterHandler(client cosmosdb.MonitorDocumentClient, _ *cosmosdb.Query, opts *cosmosdb.Options, now func() time.Time) cosmosdb.MonitorDocumentRawIterator { - input, err := client.ListAll(context.Background(), opts) - if err != nil { - // TODO: should this never happen? 
- panic(err) - } - if input == nil { - return cosmosdb.NewFakeMonitorDocumentIterator(nil, 0) - } - - out := []*api.MonitorDocument{} - for _, r := range input.MonitorDocuments { - if r.ID == "master" { - continue - } - // XXX: This does not test for TTL -- we need to add saving a Timestamp to gocosmosdb - out = append(out, r) - } - return cosmosdb.NewFakeMonitorDocumentIterator(out, 0) -} - -func injectMonitors(c *cosmosdb.FakeMonitorDocumentClient, now func() time.Time) { - c.SetQueryHandler(database.MonitorsTryLeaseQuery, func(client cosmosdb.MonitorDocumentClient, query *cosmosdb.Query, opts *cosmosdb.Options) cosmosdb.MonitorDocumentRawIterator { - return fakeMonitorGetMasterQuery(client, query, opts, now) - }) - c.SetQueryHandler(database.MonitorsWorkerQuery, func(client cosmosdb.MonitorDocumentClient, query *cosmosdb.Query, opts *cosmosdb.Options) cosmosdb.MonitorDocumentRawIterator { - return fakeMonitorGetAllButMasterHandler(client, query, opts, now) - }) - c.SetTriggerHandler("renewLease", func(ctx context.Context, doc *api.MonitorDocument) error { - return fakeMonitoringRenewLeaseTrigger(ctx, doc, now) - }) - c.SetSorter(func(in []*api.MonitorDocument) { - slices.SortFunc(in, func(a, b *api.MonitorDocument) int { return CompareIDable(a, b) }) - }) -} From 2390726f679068178dcc31fd110b1758881d85c0 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 17:03:35 +1100 Subject: [PATCH 12/20] regen --- pkg/deploy/assets/databases-development.json | 40 ------------------- pkg/deploy/assets/rp-production.json | 42 -------------------- 2 files changed, 82 deletions(-) diff --git a/pkg/deploy/assets/databases-development.json b/pkg/deploy/assets/databases-development.json index 8350bf2a355..b6399d7e60d 100644 --- a/pkg/deploy/assets/databases-development.json +++ b/pkg/deploy/assets/databases-development.json @@ -135,28 +135,6 @@ }, "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" }, - { - "apiVersion": "2023-04-15", - 
"dependsOn": [ - "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), parameters('databaseName'))]" - ], - "location": "[resourceGroup().location]", - "name": "[concat(parameters('databaseAccountName'), '/', parameters('databaseName'), '/Monitors')]", - "properties": { - "options": {}, - "resource": { - "defaultTtl": -1, - "id": "Monitors", - "partitionKey": { - "kind": "Hash", - "paths": [ - "/id" - ] - } - } - }, - "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" - }, { "apiVersion": "2023-04-15", "dependsOn": [ @@ -396,24 +374,6 @@ }, "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" }, - { - "apiVersion": "2023-04-15", - "dependsOn": [ - "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), parameters('databaseName'))]", - "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers', parameters('databaseAccountName'), parameters('databaseName'), 'Monitors')]" - ], - "location": "[resourceGroup().location]", - "name": "[concat(parameters('databaseAccountName'), '/', parameters('databaseName'), '/Monitors/renewLease')]", - "properties": { - "resource": { - "body": "function trigger() {\n\t\t\t\tvar request = getContext().getRequest();\n\t\t\t\tvar body = request.getBody();\n\t\t\t\tvar date = new Date();\n\t\t\t\tbody[\"leaseExpires\"] = Math.floor(date.getTime() / 1000) + 60;\n\t\t\t\trequest.setBody(body);\n\t\t\t}", - "id": "renewLease", - "triggerOperation": "All", - "triggerType": "Pre" - } - }, - "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" - }, { "apiVersion": "2023-04-15", "dependsOn": [ diff --git a/pkg/deploy/assets/rp-production.json b/pkg/deploy/assets/rp-production.json index b10d259ffa1..aab28397aef 100644 --- a/pkg/deploy/assets/rp-production.json +++ b/pkg/deploy/assets/rp-production.json @@ -865,29 +865,6 @@ }, "type": 
"Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" }, - { - "apiVersion": "2023-04-15", - "dependsOn": [ - "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), 'ARO')]", - "[resourceId('Microsoft.DocumentDB/databaseAccounts', parameters('databaseAccountName'))]" - ], - "location": "[resourceGroup().location]", - "name": "[concat(parameters('databaseAccountName'), '/', 'ARO', '/Monitors')]", - "properties": { - "options": {}, - "resource": { - "defaultTtl": -1, - "id": "Monitors", - "partitionKey": { - "kind": "Hash", - "paths": [ - "/id" - ] - } - } - }, - "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers" - }, { "apiVersion": "2023-04-15", "dependsOn": [ @@ -1144,25 +1121,6 @@ }, "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" }, - { - "apiVersion": "2023-04-15", - "dependsOn": [ - "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases', parameters('databaseAccountName'), 'ARO')]", - "[resourceId('Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers', parameters('databaseAccountName'), 'ARO', 'Monitors')]", - "[resourceId('Microsoft.DocumentDB/databaseAccounts', parameters('databaseAccountName'))]" - ], - "location": "[resourceGroup().location]", - "name": "[concat(parameters('databaseAccountName'), '/', 'ARO', '/Monitors/renewLease')]", - "properties": { - "resource": { - "body": "function trigger() {\n\t\t\t\tvar request = getContext().getRequest();\n\t\t\t\tvar body = request.getBody();\n\t\t\t\tvar date = new Date();\n\t\t\t\tbody[\"leaseExpires\"] = Math.floor(date.getTime() / 1000) + 60;\n\t\t\t\trequest.setBody(body);\n\t\t\t}", - "id": "renewLease", - "triggerOperation": "All", - "triggerType": "Pre" - } - }, - "type": "Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/triggers" - }, { "apiVersion": "2023-04-15", "dependsOn": [ From ab81e24db5970d4a5ee1d536b79f21014904cf1e Mon Sep 17 00:00:00 2001 From: Amber 
Brown Date: Fri, 27 Mar 2026 17:09:00 +1100 Subject: [PATCH 13/20] move the master doc priming into the buckets code --- pkg/monitor/monitor.go | 18 ------------------ pkg/util/buckets/balancer.go | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/pkg/monitor/monitor.go b/pkg/monitor/monitor.go index 124117b4cd3..3ad3d2c0859 100644 --- a/pkg/monitor/monitor.go +++ b/pkg/monitor/monitor.go @@ -7,7 +7,6 @@ import ( "context" "fmt" "math/rand" - "net/http" "sync" "sync/atomic" "time" @@ -18,7 +17,6 @@ import ( "github.com/Azure/ARO-RP/pkg/api" "github.com/Azure/ARO-RP/pkg/database" - "github.com/Azure/ARO-RP/pkg/database/cosmosdb" "github.com/Azure/ARO-RP/pkg/env" "github.com/Azure/ARO-RP/pkg/hive" "github.com/Azure/ARO-RP/pkg/metrics" @@ -135,19 +133,6 @@ func (mon *monitor) Run(ctx context.Context) error { mon.hiveClusterManagers[1] = cl } - // We always need a master document to exist so that we can attempt to - // dequeue it. If it already exists we will get a StatusPreconditionFailed - // error, which is expected and we can ignore. The leasing of the master - // document is in `mon.master()`. 
- _, err = dbPoolWorkers.Create(ctx, api.PoolWorkerTypeMonitor, &api.PoolWorkerDocument{ - ID: string(api.PoolWorkerTypeMonitor), - WorkerType: api.PoolWorkerTypeMonitor, - }) - if err != nil && !cosmosdb.IsErrorStatusCode(err, http.StatusPreconditionFailed) { - mon.baseLog.Error(fmt.Errorf("error bootstrapping master PoolWorkerDocument (not a 412): %w", err)) - return err - } - err = mon.startChangefeeds(ctx, nil) if err != nil { mon.baseLog.Error(fmt.Errorf("failed to start changefeed subscriber: %w", err)) @@ -155,9 +140,6 @@ func (mon *monitor) Run(ctx context.Context) error { } go mon.changefeedMetrics(nil) - t := time.NewTicker(mon.changefeedInterval) - defer t.Stop() - go heartbeat.EmitHeartbeat(mon.baseLog, mon.m, "monitor.heartbeat", nil, mon.checkReady) return buckets.StartBucketWorkerLoop(ctx, mon.baseLog, api.PoolWorkerTypeMonitor, mon.bucketCount, mon.changefeedInterval, dbPoolWorkers, mon.onBuckets, nil) diff --git a/pkg/util/buckets/balancer.go b/pkg/util/buckets/balancer.go index 2b39e781062..51d15079d09 100644 --- a/pkg/util/buckets/balancer.go +++ b/pkg/util/buckets/balancer.go @@ -3,12 +3,14 @@ package buckets import ( "context" "fmt" + "net/http" "time" "github.com/sirupsen/logrus" "github.com/Azure/ARO-RP/pkg/api" "github.com/Azure/ARO-RP/pkg/database" + "github.com/Azure/ARO-RP/pkg/database/cosmosdb" ) func StartBucketWorkerLoop( @@ -24,6 +26,19 @@ func StartBucketWorkerLoop( t := time.NewTicker(interval) defer t.Stop() + // We always need a master document to exist so that we can attempt to + // dequeue it. If it already exists we will get a StatusPreconditionFailed + // error, which is expected and we can ignore. The leasing of the master + // document is in `tryMaster()`. 
+ _, err := dbPoolWorkers.Create(ctx, workerType, &api.PoolWorkerDocument{ + ID: string(workerType), + WorkerType: workerType, + }) + if err != nil && !cosmosdb.IsErrorStatusCode(err, http.StatusPreconditionFailed) { + log.Error(fmt.Errorf("error bootstrapping master PoolWorkerDocument (not a 412): %w", err)) + return err + } + isMaster := false for { // register ourself as a monitor, ttl of 60s default From 849d0c5c9fe72fe6dadf58fa58b51754a8779dd0 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 17:10:19 +1100 Subject: [PATCH 14/20] nits --- pkg/database/poolworkers.go | 4 ++-- pkg/mimo/actuator/service.go | 4 ++-- pkg/mimo/scheduler/service.go | 7 +++---- test/database/poolworkers.go | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pkg/database/poolworkers.go b/pkg/database/poolworkers.go index f98eb22d5a2..f78b50a529d 100644 --- a/pkg/database/poolworkers.go +++ b/pkg/database/poolworkers.go @@ -15,8 +15,8 @@ import ( ) const ( - PoolWorkerGetMasterQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id = "@workerType" AND doc.poolWorker.workerType = "@workerType" AND (doc.leaseExpires ?? 0) < GetCurrentTimestamp() / 1000` - PoolWorkerGetWorkersQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id != "@workerType" AND doc.poolWorker.workerType = "@workerType"` + PoolWorkerGetMasterQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id = "@workerType" AND doc.workerType = "@workerType" AND (doc.leaseExpires ?? 
0) < GetCurrentTimestamp() / 1000` + PoolWorkerGetWorkersQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id != "@workerType" AND doc.workerType = "@workerType"` ) type poolWorkers struct { diff --git a/pkg/mimo/actuator/service.go b/pkg/mimo/actuator/service.go index 2d9b72cb3a4..649c2985451 100644 --- a/pkg/mimo/actuator/service.go +++ b/pkg/mimo/actuator/service.go @@ -113,8 +113,8 @@ func NewService(env env.Interface, log *logrus.Entry, dialer proxy.Dialer, dbg a taskPollTime: 90 * time.Second, // Bucket timing is set lower to prioritise responsiveness to VM changes - bucketRefreshInterval: 30 * time.Second, - bucketRefreshReadinessInterval: 45 * time.Second, + bucketRefreshInterval: 10 * time.Second, + bucketRefreshReadinessInterval: 60 * time.Second, readyDelay: time.Minute * 2, serveHealthz: true, diff --git a/pkg/mimo/scheduler/service.go b/pkg/mimo/scheduler/service.go index 5d9c9dd872b..6151bc5880c 100644 --- a/pkg/mimo/scheduler/service.go +++ b/pkg/mimo/scheduler/service.go @@ -106,9 +106,8 @@ func NewService(env env.Interface, log *logrus.Entry, dbg schedulerDBs, m metric changefeedBatchSize: 50, changefeedInterval: 10 * time.Second, - // Bucket timing is set to prioritise responsiveness to VM changes - bucketRefreshInterval: 30 * time.Second, - bucketRefreshReadinessInterval: 45 * time.Second, + bucketRefreshInterval: 10 * time.Second, + bucketRefreshReadinessInterval: 60 * time.Second, subs: changefeed.NewSubscriptionsChangefeedCache(m, false), @@ -371,7 +370,7 @@ func (s *service) worker(stop <-chan struct{}, id string) { } ownedBuckets := make(map[string]struct{}) - for i := range _ownedBuckets { + for _, i := range _ownedBuckets { ownedBuckets[fmt.Sprintf("%d", i)] = struct{}{} } diff --git a/test/database/poolworkers.go b/test/database/poolworkers.go index 0d9a84a23e6..be6820cb9a1 100644 --- a/test/database/poolworkers.go +++ b/test/database/poolworkers.go @@ -13,7 +13,7 @@ import ( "github.com/Azure/ARO-RP/pkg/database/cosmosdb" ) -func 
fakePoolWorkeringRenewLeaseTrigger(_ context.Context, doc *api.PoolWorkerDocument, now func() time.Time) error { +func fakePoolWorkerRenewLeaseTrigger(_ context.Context, doc *api.PoolWorkerDocument, now func() time.Time) error { doc.LeaseExpires = int(now().Unix()) + 60 return nil } @@ -74,7 +74,7 @@ func injectPoolWorkers(c *cosmosdb.FakePoolWorkerDocumentClient, now func() time return fakePoolWorkerGetAllButMasterHandler(client, query, opts, now) }) c.SetTriggerHandler("renewLease", func(ctx context.Context, doc *api.PoolWorkerDocument) error { - return fakePoolWorkeringRenewLeaseTrigger(ctx, doc, now) + return fakePoolWorkerRenewLeaseTrigger(ctx, doc, now) }) c.SetSorter(func(in []*api.PoolWorkerDocument) { slices.SortFunc(in, func(a, b *api.PoolWorkerDocument) int { return CompareIDable(a, b) }) From 8d3f3075137aaf9e1ac382366ca8e046c93f3414 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Fri, 27 Mar 2026 17:12:40 +1100 Subject: [PATCH 15/20] ignore empty bucket allocations --- pkg/mimo/actuator/service.go | 4 ++++ pkg/mimo/scheduler/service.go | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/pkg/mimo/actuator/service.go b/pkg/mimo/actuator/service.go index 649c2985451..5e380d012e1 100644 --- a/pkg/mimo/actuator/service.go +++ b/pkg/mimo/actuator/service.go @@ -194,6 +194,10 @@ func (s *service) Run(ctx context.Context, stop <-chan struct{}, done chan<- str go buckets.StartBucketWorkerLoop( ctx, s.baseLog, api.PoolWorkerTypeMIMOActuator, s.bucketCount, s.bucketRefreshInterval, dbPoolWorkers, func(i []int) { + if len(i) == 0 { + s.baseLog.Error("got an allocation of 0 buckets, ignoring") + return + } s.b.SetBuckets(i) s.lastBucketUpdate.Store(s.now()) }, stop, diff --git a/pkg/mimo/scheduler/service.go b/pkg/mimo/scheduler/service.go index 6151bc5880c..36bd2f1d0ba 100644 --- a/pkg/mimo/scheduler/service.go +++ b/pkg/mimo/scheduler/service.go @@ -194,6 +194,10 @@ func (s *service) Run(ctx context.Context, stop <-chan struct{}, done chan<- str go 
buckets.StartBucketWorkerLoop( ctx, s.baseLog, api.PoolWorkerTypeMIMOScheduler, s.bucketCount, s.bucketRefreshInterval, dbPoolWorkers, func(i []int) { + if len(i) == 0 { + s.baseLog.Error("got an allocation of 0 buckets, ignoring") + return + } s.buckets.Store(i) s.lastBucketUpdate.Store(s.now()) }, stop, From 324488de9f1d8c80fdf6bfb48d482fdf828bfb97 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 30 Mar 2026 10:11:05 +1100 Subject: [PATCH 16/20] fixes --- pkg/frontend/admin_openshiftcluster_mimo_selectors_test.go | 2 ++ pkg/util/buckets/balancer.go | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pkg/frontend/admin_openshiftcluster_mimo_selectors_test.go b/pkg/frontend/admin_openshiftcluster_mimo_selectors_test.go index 722761d43d7..c79345a893b 100644 --- a/pkg/frontend/admin_openshiftcluster_mimo_selectors_test.go +++ b/pkg/frontend/admin_openshiftcluster_mimo_selectors_test.go @@ -71,6 +71,7 @@ func TestGetMIMOSelectors(t *testing.T) { "subscriptionState": "Registered", "APIServerVisibility": "Public", "architectureVersion": "0", + "bucketID": "0", "authenticationType": "ServicePrincipal", "isManagedDomain": "false", "outboundType": "Loadbalancer", @@ -123,6 +124,7 @@ func TestGetMIMOSelectors(t *testing.T) { "subscriptionState": "Suspended", "APIServerVisibility": "Public", "architectureVersion": "0", + "bucketID": "0", "authenticationType": "ServicePrincipal", "isManagedDomain": "false", "outboundType": "Loadbalancer", diff --git a/pkg/util/buckets/balancer.go b/pkg/util/buckets/balancer.go index 51d15079d09..e7bc56b8e18 100644 --- a/pkg/util/buckets/balancer.go +++ b/pkg/util/buckets/balancer.go @@ -1,5 +1,8 @@ package buckets +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. 
+ import ( "context" "fmt" From 52cb32d350c231005d442d377d3a5b1b5c4a07bd Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 30 Mar 2026 13:28:42 +1100 Subject: [PATCH 17/20] add some logs --- pkg/util/buckets/balancer.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/util/buckets/balancer.go b/pkg/util/buckets/balancer.go index e7bc56b8e18..e8220216aa6 100644 --- a/pkg/util/buckets/balancer.go +++ b/pkg/util/buckets/balancer.go @@ -89,6 +89,7 @@ func tryMaster( if !isMaster { doc, err := dbPoolWorkers.TryLease(ctx, workerType) if err != nil || doc == nil { + log.Debugf("err: %s, doc: %#v", err, doc) return false, err } isMaster = true @@ -118,6 +119,8 @@ func tryMaster( } } + log.Debugf("workers: %v", workers) + balance(workers, bucketCount, doc) return nil }) From c06e8324616c54085f7d0042825190ed62e798bd Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 30 Mar 2026 16:06:37 +1100 Subject: [PATCH 18/20] correct some names --- pkg/util/buckets/balancer.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/util/buckets/balancer.go b/pkg/util/buckets/balancer.go index e8220216aa6..2ff5892326a 100644 --- a/pkg/util/buckets/balancer.go +++ b/pkg/util/buckets/balancer.go @@ -44,7 +44,7 @@ func StartBucketWorkerLoop( isMaster := false for { - // register ourself as a monitor, ttl of 60s default + // register ourself as a worker, ttl of 60s default err := dbPoolWorkers.PoolWorkerHeartbeat(ctx, workerType, int(interval.Seconds()*6)) if err != nil { log.Error(fmt.Errorf("error registering ourselves as a %s poolWorker, continuing: %w", workerType, err)) @@ -149,8 +149,8 @@ func balance(workers []string, bucketCount int, doc *api.PoolWorkerDocument) { var unallocated []int m := make(map[string][]int, len(workers)) // map of worker to list of buckets it owns - for _, monitor := range workers { - m[monitor] = nil + for _, worker := range workers { + m[worker] = nil } var target int // target number of buckets per worker @@ -173,7 
+173,7 @@ func balance(workers []string, bucketCount int, doc *api.PoolWorkerDocument) { } } - // reallocate all unallocated buckets, appending to the least loaded monitor + // reallocate all unallocated buckets, appending to the least loaded worker if len(workers) > 0 { for _, i := range unallocated { var leastWorker string From 83d7121e10fb8673b6fdcfcea1be5d83013fca5c Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 30 Mar 2026 16:08:09 +1100 Subject: [PATCH 19/20] this should not have quotation marks --- pkg/database/poolworkers.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/database/poolworkers.go b/pkg/database/poolworkers.go index f78b50a529d..4a5ffc486d9 100644 --- a/pkg/database/poolworkers.go +++ b/pkg/database/poolworkers.go @@ -15,8 +15,8 @@ import ( ) const ( - PoolWorkerGetMasterQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id = "@workerType" AND doc.workerType = "@workerType" AND (doc.leaseExpires ?? 0) < GetCurrentTimestamp() / 1000` - PoolWorkerGetWorkersQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id != "@workerType" AND doc.workerType = "@workerType"` + PoolWorkerGetMasterQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id = @workerType AND doc.workerType = @workerType AND (doc.leaseExpires ?? 
0) < GetCurrentTimestamp() / 1000` + PoolWorkerGetWorkersQuery string = `SELECT * FROM PoolWorkers doc WHERE doc.id != @workerType AND doc.workerType = @workerType` ) type poolWorkers struct { From 0b6e7fec9c7733a3aaab1d685c0b812de5196ac4 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Mon, 30 Mar 2026 16:13:11 +1100 Subject: [PATCH 20/20] we want these metrics as well --- cmd/aro/mimoactuator.go | 12 ++++++++++++ cmd/aro/mimoscheduler.go | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/cmd/aro/mimoactuator.go b/cmd/aro/mimoactuator.go index 46e62cdadcb..25aeebe6b21 100644 --- a/cmd/aro/mimoactuator.go +++ b/cmd/aro/mimoactuator.go @@ -11,10 +11,16 @@ import ( "github.com/sirupsen/logrus" + kmetrics "k8s.io/client-go/tools/metrics" + + "github.com/Azure/go-autorest/tracing" + "github.com/Azure/ARO-RP/pkg/database" "github.com/Azure/ARO-RP/pkg/env" "github.com/Azure/ARO-RP/pkg/metrics/statsd" + "github.com/Azure/ARO-RP/pkg/metrics/statsd/azure" "github.com/Azure/ARO-RP/pkg/metrics/statsd/golang" + "github.com/Azure/ARO-RP/pkg/metrics/statsd/k8s" "github.com/Azure/ARO-RP/pkg/mimo/actuator" "github.com/Azure/ARO-RP/pkg/mimo/tasks" "github.com/Azure/ARO-RP/pkg/proxy" @@ -52,6 +58,12 @@ func mimoActuator(ctx context.Context, _log *logrus.Entry) error { } go g.Run() + tracing.Register(azure.New(m)) + kmetrics.Register(kmetrics.RegisterOpts{ + RequestResult: k8s.NewResult(m), + RequestLatency: k8s.NewLatency(m), + }) + aead, err := encryption.NewAEADWithCore(ctx, _env, env.EncryptionSecretV2Name, env.EncryptionSecretName) if err != nil { return err diff --git a/cmd/aro/mimoscheduler.go b/cmd/aro/mimoscheduler.go index ab8daa3799c..e99a6fa0c13 100644 --- a/cmd/aro/mimoscheduler.go +++ b/cmd/aro/mimoscheduler.go @@ -11,9 +11,12 @@ import ( "github.com/sirupsen/logrus" + "github.com/Azure/go-autorest/tracing" + "github.com/Azure/ARO-RP/pkg/database" "github.com/Azure/ARO-RP/pkg/env" "github.com/Azure/ARO-RP/pkg/metrics/statsd" + 
"github.com/Azure/ARO-RP/pkg/metrics/statsd/azure" "github.com/Azure/ARO-RP/pkg/metrics/statsd/golang" "github.com/Azure/ARO-RP/pkg/mimo/scheduler" "github.com/Azure/ARO-RP/pkg/mimo/tasks" @@ -51,6 +54,8 @@ func mimoScheduler(ctx context.Context, _log *logrus.Entry) error { } go g.Run() + tracing.Register(azure.New(m)) + aead, err := encryption.NewAEADWithCore(ctx, _env, env.EncryptionSecretV2Name, env.EncryptionSecretName) if err != nil { return err