From bb851eca0bb7e36b9088c1b05b0e97d316ced80e Mon Sep 17 00:00:00 2001 From: Mathew Estafanous Date: Fri, 1 May 2026 15:14:55 -0400 Subject: [PATCH 1/3] feat: Support api key rotation of RC client and server --- cmd/main.go | 7 +- pkg/remoteconfig/updater.go | 274 +++++++++++++++++++++++-------- pkg/remoteconfig/updater_test.go | 231 ++++++++++++++++++++++++++ 3 files changed, 441 insertions(+), 71 deletions(-) create mode 100644 pkg/remoteconfig/updater_test.go diff --git a/cmd/main.go b/cmd/main.go index 44d1212b74..25b38b38c2 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -321,7 +321,7 @@ func run(opts *options) error { // If RBAC restricts list and watch permissions, the informer will log errors and may cause crash loops. // Reader interface as returned from mgr.GetAPIReader() reads directly from API server bypassing cache and informer initialization. credsManager := config.NewCredentialManagerWithDecryptor(mgr.GetAPIReader(), secrets.NewSecretBackend()) - creds, err := credsManager.GetCredentials() + _, err = credsManager.GetCredentials() if opts.secretRefreshInterval > 0 && opts.secretBackendCommand == "" { setupLog.Error(nil, "secretRefreshInterval is set but secretBackendCommand is not configured") @@ -340,10 +340,13 @@ func run(opts *options) error { // to handle that. <-mgr.Elected() - if rcErr := rcUpdater.Setup(creds); rcErr != nil { + if rcErr := rcUpdater.Setup(credsManager); rcErr != nil { setupErrorf(setupLog, rcErr, "Unable to set up Remote Config service") return } + if opts.secretBackendCommand != "" && opts.secretRefreshInterval > 0 { + go rcUpdater.StartCredentialWatchRoutine(credsManager, opts.secretRefreshInterval) + } if opts.remoteUpdatesEnabled { if rcErr := setupFleetDaemon(setupLog, mgr, rcUpdater.Client(), opts.createControllerRevisions && opts.datadogAgentInternalEnabled); rcErr != nil { diff --git a/pkg/remoteconfig/updater.go b/pkg/remoteconfig/updater.go index 3374dee90a..e345fa6e41 100644 --- a/pkg/remoteconfig/updater.go +++ b/pkg/remoteconfig/updater.go @@ -41,11 +41,35 @@ const ( type RemoteConfigUpdater struct { kubeClient kubeclient.Client - rcClient *client.Client - rcService *service.CoreAgentService + rcClient rcRuntimeClient + rcService rcService serviceConf RcServiceConfiguration logger logr.Logger mu sync.RWMutex + + lifecycleMu sync.Mutex + activeAPIKey string + subscriptions []rcSubscription + installerState []*pbgo.PackageState + remoteConfigFactory rcRuntimeFactory +} + +type rcRuntimeFactory func(conf RcServiceConfiguration) (rcService, rcRuntimeClient, error) + +type rcService interface { + Start() + Stop() error +} + +type rcRuntimeClient interface { + RCClient + Start() + Close() +} + +type rcSubscription struct { + product string + fn func(update map[string]state.RawConfig, applyStateCallback func(string, state.ApplyStatus)) } type RcServiceConfiguration struct { @@ -68,7 +92,7 @@ type RCClient interface { // Client returns the underlying RC client. func (r *RemoteConfigUpdater) Client() RCClient { - return r.rcClient + return r } // DatadogProductRemoteConfig is an interface for Datadog product remote configuration @@ -135,98 +159,90 @@ type dummyTelemetryReporter struct{} func (d dummyTelemetryReporter) IncRateLimit() {} func (d dummyTelemetryReporter) IncTimeout() {} -func (r *RemoteConfigUpdater) Setup(creds config.Creds) error { - apiKey := creds.APIKey - if apiKey == "" { +func (r *RemoteConfigUpdater) Setup(credsManager *config.CredentialManager) error { + creds, err := credsManager.GetCredentials() + if err != nil { + return err + } + if creds.APIKey == "" { return errors.New("error obtaining API key") } + r.lifecycleMu.Lock() + defer r.lifecycleMu.Unlock() + if r.rcClient != nil || r.rcService != nil { + return nil + } + + if err := r.startRuntime(creds.APIKey); err != nil { + return err + } + r.activeAPIKey = creds.APIKey + return nil +} + +// startRuntime will start the remote configuration client and service. This must be called a held lock. +func (r *RemoteConfigUpdater) startRuntime(apiKey string) error { site := os.Getenv(constants.DDSite) // TODO support DD_URL as well clusterName := os.Getenv(constants.DDClusterName) directorRoot := os.Getenv("DD_REMOTE_CONFIGURATION_DIRECTOR_ROOT") configRoot := os.Getenv("DD_REMOTE_CONFIGURATION_CONFIG_ROOT") endpoint := os.Getenv("DD_REMOTE_CONFIGURATION_RC_DD_URL") - if r.rcClient == nil && r.rcService == nil { - // Setup rcClient and rcService - err := r.Start(apiKey, site, clusterName, directorRoot, configRoot, endpoint) - if err != nil { - return err - } - } - - return nil - + return r.startRuntimeWithConfig(apiKey, site, clusterName, directorRoot, configRoot, endpoint) } -func (r *RemoteConfigUpdater) Start(apiKey string, site string, clusterName string, directorRoot string, configRoot string, endpoint string) error { - +func (r *RemoteConfigUpdater) startRuntimeWithConfig(apiKey string, site string, clusterName string, directorRoot string, configRoot string, endpoint string) error { r.logger.Info("Starting Remote Configuration client and service") - err := r.configureService(apiKey, site, clusterName, directorRoot, configRoot, endpoint) + serviceConf, err := r.newServiceConfiguration(apiKey, site, clusterName, directorRoot, configRoot, endpoint) if err != nil { r.logger.Error(err, "Failed to configure Remote Configuration service") return err } - rcService, err := service.NewService( - r.serviceConf.cfg, - "", - r.serviceConf.baseRawURL, - r.serviceConf.hostname, - func() []string { return []string{"cluster_name:" + r.serviceConf.clusterName} }, - r.serviceConf.telemetryReporter, - r.serviceConf.agentVersion, - service.WithAPIKey(apiKey), - service.WithDatabaseFileName(filepath.Join(r.serviceConf.rcDatabaseDir, fmt.Sprintf("remote-config-%s.db", uuid.New()))), - service.WithDirectorRootOverride(r.serviceConf.cfg.GetString("site"), r.serviceConf.cfg.GetString("remote_configuration.director_root")), - service.WithConfigRootOverride(r.serviceConf.cfg.GetString("site"), r.serviceConf.cfg.GetString("remote_configuration.config_root")), - ) + newService, newClient, err := r.remoteConfigFactory(serviceConf) if err != nil { - r.logger.Error(err, "Failed to create Remote Configuration service") + r.logger.Error(err, "Failed to create Remote Configuration runtime") return err } - r.rcService = rcService - updaterTags := []string{"updater_type:datadog-operator"} - if r.serviceConf.clusterName != "" { - updaterTags = append(updaterTags, "cluster_name:"+r.serviceConf.clusterName) - } - rcClient, err := client.NewClient( - rcService, - client.WithUpdater(updaterTags...), - client.WithProducts(state.ProductAgentConfig, state.ProductOrchestratorK8sCRDs), - client.WithDirectorRootOverride(r.serviceConf.cfg.GetString("site"), r.serviceConf.cfg.GetString("remote_configuration.director_root")), - client.WithPollInterval(pollInterval), - ) - if err != nil { - r.logger.Error(err, "Failed to create Remote Configuration client") - return err - } - r.rcClient = rcClient - rcClient.SetInstallerState([]*pbgo.PackageState{ - { - Package: "datadog-operator", - StableVersion: "0.0.1", - StableConfigVersion: "0.0.1", - }, - }) + oldSvc := r.rcService + oldClient := r.rcClient - rcService.Start() + newClient.SetInstallerState(r.installerState) + + newService.Start() r.logger.Info("Remote Configuration service started") - rcClient.Start() - r.logger.Info("Remote Configuration client started") + for _, subscription := range r.subscriptions { + newClient.Subscribe(subscription.product, subscription.fn) + } - rcClient.Subscribe(string(state.ProductAgentConfig), r.agentConfigUpdateCallback) + newClient.Start() + r.logger.Info("Remote Configuration client started") - rcClient.Subscribe(string(state.ProductOrchestratorK8sCRDs), r.crdConfigUpdateCallback) + r.serviceConf = serviceConf + r.rcService = newService + r.rcClient = newClient + + // Clean up old service/client after the new ones are setup and swapped. + if oldSvc != nil { + if err := oldSvc.Stop(); err != nil { + // The new runtime is already active; returning this error would leave + // activeAPIKey stale and cause repeated restart attempts. + r.logger.Error(err, "Failed to stop previous Remote Configuration service") + } + } + if oldClient != nil { + oldClient.Close() + } return nil } -// configureService fills the configuration needed to start the rc service -func (r *RemoteConfigUpdater) configureService(apiKey, site, clusterName, directorRoot, configRoot, endpoint string) error { +// newServiceConfiguration builds the configuration needed to start the rc service. +func (r *RemoteConfigUpdater) newServiceConfiguration(apiKey, site, clusterName, directorRoot, configRoot, endpoint string) (RcServiceConfiguration, error) { cfg := model.NewConfig("datadog", "DD", strings.NewReplacer(".", "_")) cfg.SetWithoutSource("api_key", apiKey) @@ -242,10 +258,10 @@ func (r *RemoteConfigUpdater) configureService(apiKey, site, clusterName, direct // TODO consider different dir baseDir := filepath.Join(os.TempDir(), "datadog-operator") if err := os.MkdirAll(baseDir, 0777); err != nil { - return err + return RcServiceConfiguration{}, err } - serviceConf := RcServiceConfiguration{ + return RcServiceConfiguration{ cfg: cfg, apiKey: apiKey, baseRawURL: endpoint, @@ -255,9 +271,43 @@ func (r *RemoteConfigUpdater) configureService(apiKey, site, clusterName, direct // TODO fix when other values accepted agentVersion: "7.50.0", rcDatabaseDir: baseDir, + }, nil +} + +func defaultRCFactory(conf RcServiceConfiguration) (rcService, rcRuntimeClient, error) { + rcService, err := service.NewService( + conf.cfg, + "", + conf.baseRawURL, + conf.hostname, + func() []string { return []string{"cluster_name:" + conf.clusterName} }, + conf.telemetryReporter, + conf.agentVersion, + service.WithAPIKey(conf.apiKey), + service.WithDatabaseFileName(filepath.Join(conf.rcDatabaseDir, fmt.Sprintf("remote-config-%s.db", uuid.New()))), + service.WithDirectorRootOverride(conf.cfg.GetString("site"), conf.cfg.GetString("remote_configuration.director_root")), + service.WithConfigRootOverride(conf.cfg.GetString("site"), conf.cfg.GetString("remote_configuration.config_root")), + ) + if err != nil { + return nil, nil, err } - r.serviceConf = serviceConf - return nil + + updaterTags := []string{"updater_type:datadog-operator"} + if conf.clusterName != "" { + updaterTags = append(updaterTags, "cluster_name:"+conf.clusterName) + } + rcClient, err := client.NewClient( + rcService, + client.WithUpdater(updaterTags...), + client.WithProducts(state.ProductAgentConfig, state.ProductOrchestratorK8sCRDs), + client.WithDirectorRootOverride(conf.cfg.GetString("site"), conf.cfg.GetString("remote_configuration.director_root")), + client.WithPollInterval(pollInterval), + ) + if err != nil { + return nil, nil, err + } + + return rcService, rcClient, nil } // getEndpoint returns the Remote Config endpoint, based on `site` and the prefix @@ -268,6 +318,76 @@ func getEndpoint(prefix, site string) string { return prefix + defaultSite } +func (r *RemoteConfigUpdater) Subscribe(product string, fn func(update map[string]state.RawConfig, applyStateCallback func(string, state.ApplyStatus))) { + r.lifecycleMu.Lock() + defer r.lifecycleMu.Unlock() + + subscription := rcSubscription{product: product, fn: fn} + r.subscriptions = append(r.subscriptions, subscription) + if r.rcClient != nil { + r.rcClient.Subscribe(product, fn) + } +} + +func (r *RemoteConfigUpdater) GetInstallerState() []*pbgo.PackageState { + r.lifecycleMu.Lock() + defer r.lifecycleMu.Unlock() + if r.rcClient != nil { + return r.rcClient.GetInstallerState() + } + return r.installerState +} + +func (r *RemoteConfigUpdater) SetInstallerState(packages []*pbgo.PackageState) { + r.lifecycleMu.Lock() + defer r.lifecycleMu.Unlock() + + // Save the installer state in the updater so that it can be used to restore the state of the installer + // when swapping out remote config clients. + r.installerState = packages + if r.rcClient != nil { + r.rcClient.SetInstallerState(packages) + } +} + +func (r *RemoteConfigUpdater) syncCredentials(credsManager *config.CredentialManager) error { + creds, err := credsManager.GetCredentials() + if err != nil { + return err + } + if creds.APIKey == "" { + return errors.New("error obtaining API key") + } + + r.lifecycleMu.Lock() + defer r.lifecycleMu.Unlock() + + if creds.APIKey == r.activeAPIKey { + return nil + } + + if err := r.startRuntime(creds.APIKey); err != nil { + return err + } + r.activeAPIKey = creds.APIKey + r.logger.Info("Remote Configuration credentials changed, runtime restarted") + return nil +} + +func (r *RemoteConfigUpdater) StartCredentialWatchRoutine(credsManager *config.CredentialManager, interval time.Duration) { + r.logger.Info("Starting Remote Configuration credential watch routine", "interval", interval) + + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + <-ticker.C + if err := r.syncCredentials(credsManager); err != nil { + r.logger.Error(err, "Failed to sync Remote Configuration credentials") + } + } +} + // getAndUpdateDatadogAgent is used to prevent race conditions when updating the DDA's status // we do not want to modify the status without using this function or we could have conflicts func (r *RemoteConfigUpdater) getAndUpdateDatadogAgent(ctx context.Context, cfg DatadogProductRemoteConfig, f func(v2alpha1.DatadogAgent, DatadogProductRemoteConfig) error) error { @@ -585,6 +705,9 @@ func (r *RemoteConfigUpdater) updateInstanceStatus(dda v2alpha1.DatadogAgent, co } func (r *RemoteConfigUpdater) Stop() error { + r.lifecycleMu.Lock() + defer r.lifecycleMu.Unlock() + if r.rcService != nil { err := r.rcService.Stop() if err != nil { @@ -600,8 +723,21 @@ func (r *RemoteConfigUpdater) Stop() error { } func NewRemoteConfigUpdater(client kubeclient.Client, logger logr.Logger) *RemoteConfigUpdater { - return &RemoteConfigUpdater{ + r := &RemoteConfigUpdater{ kubeClient: client, logger: logger, + installerState: []*pbgo.PackageState{ + { + Package: "datadog-operator", + StableVersion: "0.0.1", + StableConfigVersion: "0.0.1", + }, + }, + remoteConfigFactory: defaultRCFactory, + } + r.subscriptions = []rcSubscription{ + {product: state.ProductAgentConfig, fn: r.agentConfigUpdateCallback}, + {product: state.ProductOrchestratorK8sCRDs, fn: r.crdConfigUpdateCallback}, } + return r } diff --git a/pkg/remoteconfig/updater_test.go b/pkg/remoteconfig/updater_test.go new file mode 100644 index 0000000000..fc9ce337d8 --- /dev/null +++ b/pkg/remoteconfig/updater_test.go @@ -0,0 +1,231 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package remoteconfig + +import ( + "errors" + "testing" + + pbgo "github.com/DataDog/datadog-agent/pkg/proto/pbgo/core" + "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/DataDog/datadog-operator/pkg/config" + "github.com/DataDog/datadog-operator/pkg/constants" +) + +type fakeRuntimeFactory struct { + runtimes []*fakeRuntime + failFor map[string]error +} + +type fakeRuntime struct { + apiKey string + service *fakeService + client *fakeClient +} + +type fakeService struct { + startCount int + stopCount int + stopErr error +} + +type fakeClient struct { + startCount int + closeCount int + subscriptions []string + installerState []*pbgo.PackageState + installerStateStartCount int +} + +func (f *fakeRuntimeFactory) factory(conf RcServiceConfiguration) (rcService, rcRuntimeClient, error) { + if err := f.failFor[conf.apiKey]; err != nil { + return nil, nil, err + } + runtime := &fakeRuntime{ + apiKey: conf.apiKey, + service: &fakeService{}, + client: &fakeClient{}, + } + f.runtimes = append(f.runtimes, runtime) + return runtime.service, runtime.client, nil +} + +func (f *fakeService) Start() { + f.startCount++ +} + +func (f *fakeService) Stop() error { + f.stopCount++ + return f.stopErr +} + +func (f *fakeClient) Start() { + f.startCount++ +} + +func (f *fakeClient) Close() { + f.closeCount++ +} + +func (f *fakeClient) Subscribe(product string, _ func(update map[string]state.RawConfig, applyStateCallback func(string, state.ApplyStatus))) { + f.subscriptions = append(f.subscriptions, product) +} + +func (f *fakeClient) GetInstallerState() []*pbgo.PackageState { + return f.installerState +} + +func (f *fakeClient) SetInstallerState(packages []*pbgo.PackageState) { + f.installerState = packages + f.installerStateStartCount = f.startCount +} + +func newTestUpdater(factory *fakeRuntimeFactory) *RemoteConfigUpdater { + r := NewRemoteConfigUpdater(nil, logr.Discard()) + r.remoteConfigFactory = factory.factory + return r +} + +func newTestCredentialManager(t *testing.T, apiKey string) *config.CredentialManager { + t.Helper() + t.Setenv(constants.DDAPIKey, apiKey) + t.Setenv(constants.DDAppKey, "app-key") + return config.NewCredentialManager(nil) +} + +func TestSetupFetchesCredentialsFromManager(t *testing.T) { + factory := &fakeRuntimeFactory{} + r := newTestUpdater(factory) + credsManager := newTestCredentialManager(t, "setup-api-key") + + require.NoError(t, r.Setup(credsManager)) + require.Len(t, factory.runtimes, 1) + assert.Equal(t, "setup-api-key", factory.runtimes[0].apiKey) + assert.Equal(t, 1, factory.runtimes[0].service.startCount) + assert.Equal(t, 1, factory.runtimes[0].client.startCount) +} + +func TestSyncCredentialsRestartsRuntimeWhenAPIKeyChanges(t *testing.T) { + factory := &fakeRuntimeFactory{} + r := newTestUpdater(factory) + credsManager := newTestCredentialManager(t, "old-api-key") + require.NoError(t, r.Setup(credsManager)) + + t.Setenv(constants.DDAPIKey, "new-api-key") + t.Setenv(constants.DDAppKey, "new-app-key") + require.NoError(t, credsManager.Refresh(logr.Discard())) + require.NoError(t, r.syncCredentials(credsManager)) + + require.Len(t, factory.runtimes, 2) + assert.Equal(t, "new-api-key", factory.runtimes[1].apiKey) + assert.Equal(t, 1, factory.runtimes[0].service.stopCount) + assert.Equal(t, 1, factory.runtimes[0].client.closeCount) + assert.Equal(t, 1, factory.runtimes[1].service.startCount) + assert.Equal(t, 1, factory.runtimes[1].client.startCount) +} + +func TestSyncCredentialsNoopsWhenAPIKeyUnchanged(t *testing.T) { + factory := &fakeRuntimeFactory{} + r := newTestUpdater(factory) + credsManager := newTestCredentialManager(t, "same-api-key") + require.NoError(t, r.Setup(credsManager)) + + require.NoError(t, r.syncCredentials(credsManager)) + + require.Len(t, factory.runtimes, 1) + assert.Equal(t, 0, factory.runtimes[0].service.stopCount) + assert.Equal(t, 0, factory.runtimes[0].client.closeCount) +} + +func TestSyncCredentialsReturnsErrorWithoutStoppingActiveRuntimeWhenCredentialsCannotBeRead(t *testing.T) { + factory := &fakeRuntimeFactory{} + r := newTestUpdater(factory) + credsManager := newTestCredentialManager(t, "old-api-key") + require.NoError(t, r.Setup(credsManager)) + + t.Setenv(constants.DDAPIKey, "") + t.Setenv(constants.DDAppKey, "") + credsManager = config.NewCredentialManager(nil) + + err := r.syncCredentials(credsManager) + require.Error(t, err) + require.Len(t, factory.runtimes, 1) + assert.Equal(t, 0, factory.runtimes[0].service.stopCount) + assert.Equal(t, 0, factory.runtimes[0].client.closeCount) +} + +func TestSyncCredentialsLeavesOldRuntimeActiveWhenReplacementCreationFails(t *testing.T) { + factory := &fakeRuntimeFactory{failFor: map[string]error{"new-api-key": errors.New("boom")}} + r := newTestUpdater(factory) + credsManager := newTestCredentialManager(t, "old-api-key") + require.NoError(t, r.Setup(credsManager)) + + t.Setenv(constants.DDAPIKey, "new-api-key") + t.Setenv(constants.DDAppKey, "new-app-key") + require.NoError(t, credsManager.Refresh(logr.Discard())) + err := r.syncCredentials(credsManager) + + require.Error(t, err) + require.Len(t, factory.runtimes, 1) + assert.Equal(t, 0, factory.runtimes[0].service.stopCount) + assert.Equal(t, 0, factory.runtimes[0].client.closeCount) + assert.Equal(t, "old-api-key", r.activeAPIKey) +} + +func TestSyncCredentialsTreatsOldRuntimeCleanupErrorAsNonFatal(t *testing.T) { + factory := &fakeRuntimeFactory{} + r := newTestUpdater(factory) + credsManager := newTestCredentialManager(t, "old-api-key") + require.NoError(t, r.Setup(credsManager)) + factory.runtimes[0].service.stopErr = errors.New("stop failed") + + t.Setenv(constants.DDAPIKey, "new-api-key") + t.Setenv(constants.DDAppKey, "new-app-key") + require.NoError(t, credsManager.Refresh(logr.Discard())) + require.NoError(t, r.syncCredentials(credsManager)) + + require.Len(t, factory.runtimes, 2) + assert.Equal(t, "new-api-key", r.activeAPIKey) + assert.Equal(t, factory.runtimes[1].service, r.rcService) + assert.Equal(t, factory.runtimes[1].client, r.rcClient) + assert.Equal(t, 1, factory.runtimes[0].service.stopCount) + assert.Equal(t, 1, factory.runtimes[0].client.closeCount) +} + +func TestSyncCredentialsPreservesSubscriptionsAndInstallerStateAcrossRestarts(t *testing.T) { + factory := &fakeRuntimeFactory{} + r := newTestUpdater(factory) + credsManager := newTestCredentialManager(t, "old-api-key") + require.NoError(t, r.Setup(credsManager)) + + r.Subscribe("fleet-product", func(map[string]state.RawConfig, func(string, state.ApplyStatus)) {}) + r.SetInstallerState([]*pbgo.PackageState{{ + Package: "fleet-package", + StableVersion: "1.2.3", + StableConfigVersion: "4.5.6", + }}) + + t.Setenv(constants.DDAPIKey, "new-api-key") + t.Setenv(constants.DDAppKey, "new-app-key") + require.NoError(t, credsManager.Refresh(logr.Discard())) + require.NoError(t, r.syncCredentials(credsManager)) + + require.Len(t, factory.runtimes, 2) + newClient := factory.runtimes[1].client + assert.ElementsMatch(t, []string{ + string(state.ProductAgentConfig), + string(state.ProductOrchestratorK8sCRDs), + "fleet-product", + }, newClient.subscriptions) + require.Len(t, newClient.installerState, 1) + assert.Equal(t, "fleet-package", newClient.installerState[0].Package) + assert.Equal(t, 0, newClient.installerStateStartCount) + assert.Equal(t, 1, newClient.startCount) +} From 340c3d2b2e781f4d97244f1deb448b462cfb4962 Mon Sep 17 00:00:00 2001 From: Mathew Estafanous Date: Mon, 4 May 2026 13:11:27 -0400 Subject: [PATCH 2/3] chore: cleanup DB after rc client rotation --- pkg/remoteconfig/updater.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pkg/remoteconfig/updater.go b/pkg/remoteconfig/updater.go index e345fa6e41..dc64005d7b 100644 --- a/pkg/remoteconfig/updater.go +++ b/pkg/remoteconfig/updater.go @@ -80,7 +80,7 @@ type RcServiceConfiguration struct { clusterName string telemetryReporter service.RcTelemetryReporter agentVersion string - rcDatabaseDir string + rcDatabaseFile string } // RCClient is the interface for subscribing to RC product updates. @@ -209,6 +209,7 @@ func (r *RemoteConfigUpdater) startRuntimeWithConfig(apiKey string, site string, oldSvc := r.rcService oldClient := r.rcClient + oldDBFile := r.serviceConf.rcDatabaseFile newClient.SetInstallerState(r.installerState) @@ -237,6 +238,11 @@ func (r *RemoteConfigUpdater) startRuntimeWithConfig(apiKey string, site string, if oldClient != nil { oldClient.Close() } + if oldDBFile != "" { + if err := os.Remove(oldDBFile); err != nil && !os.IsNotExist(err) { + r.logger.Error(err, "Failed to remove old Remote Configuration database file", "file", oldDBFile) + } + } return nil } @@ -269,8 +275,8 @@ func (r *RemoteConfigUpdater) newServiceConfiguration(apiKey, site, clusterName, clusterName: clusterName, telemetryReporter: dummyTelemetryReporter{}, // TODO fix when other values accepted - agentVersion: "7.50.0", - rcDatabaseDir: baseDir, + agentVersion: "7.50.0", + rcDatabaseFile: filepath.Join(baseDir, fmt.Sprintf("remote-config-%s.db", uuid.New())), }, nil } @@ -284,7 +290,7 @@ func defaultRCFactory(conf RcServiceConfiguration) (rcService, rcRuntimeClient, conf.telemetryReporter, conf.agentVersion, service.WithAPIKey(conf.apiKey), - service.WithDatabaseFileName(filepath.Join(conf.rcDatabaseDir, fmt.Sprintf("remote-config-%s.db", uuid.New()))), + service.WithDatabaseFileName(conf.rcDatabaseFile), service.WithDirectorRootOverride(conf.cfg.GetString("site"), conf.cfg.GetString("remote_configuration.director_root")), service.WithConfigRootOverride(conf.cfg.GetString("site"), conf.cfg.GetString("remote_configuration.config_root")), ) From ee40d09454a23a179166ee89fde029b746a0a55a Mon Sep 17 00:00:00 2001 From: Mathew Estafanous Date: Mon, 4 May 2026 13:22:58 -0400 Subject: [PATCH 3/3] chore: comments and mini refactor --- cmd/main.go | 2 +- pkg/remoteconfig/updater.go | 22 +++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 25b38b38c2..0a058ecef6 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -349,7 +349,7 @@ func run(opts *options) error { } if opts.remoteUpdatesEnabled { - if rcErr := setupFleetDaemon(setupLog, mgr, rcUpdater.Client(), opts.createControllerRevisions && opts.datadogAgentInternalEnabled); rcErr != nil { + if rcErr := setupFleetDaemon(setupLog, mgr, rcUpdater, opts.createControllerRevisions && opts.datadogAgentInternalEnabled); rcErr != nil { setupErrorf(setupLog, rcErr, "Unable to setup Fleet daemon") } } diff --git a/pkg/remoteconfig/updater.go b/pkg/remoteconfig/updater.go index dc64005d7b..ca778984e7 100644 --- a/pkg/remoteconfig/updater.go +++ b/pkg/remoteconfig/updater.go @@ -56,11 +56,15 @@ type RemoteConfigUpdater struct { type rcRuntimeFactory func(conf RcServiceConfiguration) (rcService, rcRuntimeClient, error) +// rcService abstracts the RC service for testability. In production this is *service.CoreAgentService. type rcService interface { Start() Stop() error } +// rcRuntimeClient abstracts the inner RC client that gets swapped on key rotation. +// In production this is *client.Client. The outer RemoteConfigUpdater implements RCClient +// as a stable wrapper so callers aren't aware of client replacement. type rcRuntimeClient interface { RCClient Start() @@ -90,11 +94,6 @@ type RCClient interface { SetInstallerState(packages []*pbgo.PackageState) } -// Client returns the underlying RC client. -func (r *RemoteConfigUpdater) Client() RCClient { - return r -} - // DatadogProductRemoteConfig is an interface for Datadog product remote configuration type DatadogProductRemoteConfig interface { // GetID returns the ID of the configuration @@ -181,20 +180,17 @@ func (r *RemoteConfigUpdater) Setup(credsManager *config.CredentialManager) erro return nil } -// startRuntime will start the remote configuration client and service. This must be called a held lock. +// startRuntime creates a new RC service and client for the given API key, swapping out any +// existing runtime. Must be called with lifecycleMu held. func (r *RemoteConfigUpdater) startRuntime(apiKey string) error { + r.logger.Info("Starting Remote Configuration client and service") + site := os.Getenv(constants.DDSite) // TODO support DD_URL as well clusterName := os.Getenv(constants.DDClusterName) directorRoot := os.Getenv("DD_REMOTE_CONFIGURATION_DIRECTOR_ROOT") configRoot := os.Getenv("DD_REMOTE_CONFIGURATION_CONFIG_ROOT") endpoint := os.Getenv("DD_REMOTE_CONFIGURATION_RC_DD_URL") - return r.startRuntimeWithConfig(apiKey, site, clusterName, directorRoot, configRoot, endpoint) -} - -func (r *RemoteConfigUpdater) startRuntimeWithConfig(apiKey string, site string, clusterName string, directorRoot string, configRoot string, endpoint string) error { - r.logger.Info("Starting Remote Configuration client and service") - serviceConf, err := r.newServiceConfiguration(apiKey, site, clusterName, directorRoot, configRoot, endpoint) if err != nil { r.logger.Error(err, "Failed to configure Remote Configuration service") @@ -227,7 +223,7 @@ func (r *RemoteConfigUpdater) startRuntimeWithConfig(apiKey string, site string, r.rcService = newService r.rcClient = newClient - // Clean up old service/client after the new ones are setup and swapped. + // Clean up old service/client after the new one is set up. if oldSvc != nil { if err := oldSvc.Stop(); err != nil { // The new runtime is already active; returning this error would leave