From a1f37c72f250b080d9a7bd63f9f669232898861a Mon Sep 17 00:00:00 2001 From: Chris Doherty Date: Thu, 2 Apr 2026 21:46:54 -0500 Subject: [PATCH] feat: expose AMI cache TTL as a runtime flag Operators running large fleets can generate significant DescribeImages API call volume due to frequent AMI reconciles. This change makes the AMI cache TTL configurable so operators can tune it for their workload without rebuilding. --ami-cache-ttl (env: AMI_CACHE_TTL, default: 1m) The default preserves existing behaviour. --- kwok/operator/operator.go | 2 +- pkg/cache/cache.go | 5 +++++ pkg/operator/operator.go | 2 +- pkg/operator/options/options.go | 3 +++ pkg/operator/options/options_validation.go | 9 +++++++++ pkg/operator/options/suite_test.go | 12 +++++++++++- pkg/test/environment.go | 2 +- pkg/test/options.go | 3 +++ website/content/en/preview/reference/settings.md | 1 + 9 files changed, 35 insertions(+), 4 deletions(-) diff --git a/kwok/operator/operator.go b/kwok/operator/operator.go index 52dc9a485f8a..f5350ae04941 100644 --- a/kwok/operator/operator.go +++ b/kwok/operator/operator.go @@ -146,7 +146,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont // the previously resolved value will be used. 
lo.Must0(versionProvider.UpdateVersion(ctx)) ssmProvider := ssmp.NewDefaultProvider(ssm.NewFromConfig(cfg), ssmCache) - amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)) + amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(options.FromContext(ctx).AMICacheTTL, awscache.DefaultCleanupInterval)) placementGroupProvider := placementgroup.NewProvider( ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval), diff --git a/pkg/cache/cache.go b/pkg/cache/cache.go index cc1d5ca13c49..2178c07b88ed 100644 --- a/pkg/cache/cache.go +++ b/pkg/cache/cache.go @@ -24,6 +24,11 @@ const ( // AWS APIs, which can have a serious impact on performance and scalability. // DO NOT CHANGE THIS VALUE WITHOUT DUE CONSIDERATION DefaultTTL = time.Minute + // AMICacheTTL is the default TTL for cached AMI discovery results. Operators + // can override this at runtime via the --ami-cache-ttl flag. Setting the cache + // TTL >= the requeue interval ensures scheduled reconciles are served from cache + // rather than re-querying the EC2 API on every reconcile. + AMICacheTTL = time.Minute // PlacementGroupAvailabilityTTL is the TTL for resolved placement group data. PlacementGroupAvailabilityTTL = 24 * time.Hour // UnavailableOfferingsTTL is the time before offerings that were marked as unavailable diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 62fae13133cb..cee730742af5 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -155,7 +155,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont // the previously resolved value will be used. 
lo.Must0(versionProvider.UpdateVersion(ctx)) ssmProvider := ssmp.NewDefaultProvider(ssm.NewFromConfig(cfg), ssmCache) - amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)) + amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(options.FromContext(ctx).AMICacheTTL, awscache.DefaultCleanupInterval)) placementGroupProvider := placementgroup.NewProvider( ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval), diff --git a/pkg/operator/options/options.go b/pkg/operator/options/options.go index 3abe9f99c31a..83f86bf5850b 100644 --- a/pkg/operator/options/options.go +++ b/pkg/operator/options/options.go @@ -20,6 +20,7 @@ import ( "flag" "fmt" "os" + "time" coreoptions "sigs.k8s.io/karpenter/pkg/operator/options" "sigs.k8s.io/karpenter/pkg/utils/env" @@ -43,6 +44,7 @@ type Options struct { InterruptionQueue string ReservedENIs int DisableDryRun bool + AMICacheTTL time.Duration } func (o *Options) AddFlags(fs *coreoptions.FlagSet) { @@ -55,6 +57,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) { fs.StringVar(&o.InterruptionQueue, "interruption-queue", env.WithDefaultString("INTERRUPTION_QUEUE", ""), "Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.") fs.IntVar(&o.ReservedENIs, "reserved-enis", env.WithDefaultInt("RESERVED_ENIS", 0), "Reserved ENIs are not included in the calculations for max-pods or kube-reserved. 
This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html.") fs.BoolVarWithEnv(&o.DisableDryRun, "disable-dry-run", "DISABLE_DRY_RUN", false, "If true, then disable dry run validation for EC2NodeClasses.") + fs.DurationVar(&o.AMICacheTTL, "ami-cache-ttl", env.WithDefaultDuration("AMI_CACHE_TTL", time.Minute), "TTL for cached AMI discovery results.") } func (o *Options) Parse(fs *coreoptions.FlagSet, args ...string) error { diff --git a/pkg/operator/options/options_validation.go b/pkg/operator/options/options_validation.go index 9bf117ac0339..f8a0c17e1a6c 100644 --- a/pkg/operator/options/options_validation.go +++ b/pkg/operator/options/options_validation.go @@ -28,6 +28,7 @@ func (o *Options) Validate() error { o.validateVMMemoryOverheadPercent(), o.validateReservedENIs(), o.validateRequiredFields(), + o.validateAMICacheTTL(), ) } @@ -64,3 +65,11 @@ func (o *Options) validateRequiredFields() error { } return nil } + +func (o *Options) validateAMICacheTTL() error { + if o.AMICacheTTL <= 0 { + return fmt.Errorf("ami-cache-ttl must be positive") + } + return nil +} + diff --git a/pkg/operator/options/suite_test.go b/pkg/operator/options/suite_test.go index 17ae6d2617f3..55cec63f973d 100644 --- a/pkg/operator/options/suite_test.go +++ b/pkg/operator/options/suite_test.go @@ -19,6 +19,7 @@ import ( "flag" "os" "testing" + "time" "github.com/samber/lo" coreoptions "sigs.k8s.io/karpenter/pkg/operator/options" @@ -63,7 +64,8 @@ var _ = Describe("Options", func() { "--vm-memory-overhead-percent", "0.1", "--interruption-queue", "env-cluster", "--reserved-enis", "10", - "--disable-dry-run") + "--disable-dry-run", + "--ami-cache-ttl", "15m") Expect(err).ToNot(HaveOccurred()) expectOptionsEqual(opts, test.Options(test.OptionsFields{ ClusterCABundle: lo.ToPtr("env-bundle"), @@ -74,6 +76,7 @@ var _ = Describe("Options", func() { InterruptionQueue: lo.ToPtr("env-cluster"), ReservedENIs: lo.ToPtr(10), 
DisableDryRun: lo.ToPtr(true), + AMICacheTTL: lo.ToPtr(15 * time.Minute), })) }) It("should correctly fallback to env vars when CLI flags aren't set", func() { @@ -85,6 +88,7 @@ var _ = Describe("Options", func() { os.Setenv("INTERRUPTION_QUEUE", "env-cluster") os.Setenv("RESERVED_ENIS", "10") os.Setenv("DISABLE_DRY_RUN", "false") + os.Setenv("AMI_CACHE_TTL", "15m") // Add flags after we set the environment variables so that the parsing logic correctly refers // to the new environment variable values @@ -100,6 +104,7 @@ var _ = Describe("Options", func() { InterruptionQueue: lo.ToPtr("env-cluster"), ReservedENIs: lo.ToPtr(10), DisableDryRun: lo.ToPtr(false), + AMICacheTTL: lo.ToPtr(15 * time.Minute), })) }) @@ -123,6 +128,10 @@ var _ = Describe("Options", func() { err := opts.Parse(fs, "--cluster-name", "test-cluster", "--reserved-enis", "-1") Expect(err).To(HaveOccurred()) }) + It("should fail when ami-cache-ttl is zero", func() { + err := opts.Parse(fs, "--cluster-name", "test-cluster", "--ami-cache-ttl", "0") + Expect(err).To(HaveOccurred()) + }) }) }) @@ -136,4 +145,5 @@ func expectOptionsEqual(optsA *options.Options, optsB *options.Options) { Expect(optsA.InterruptionQueue).To(Equal(optsB.InterruptionQueue)) Expect(optsA.ReservedENIs).To(Equal(optsB.ReservedENIs)) Expect(optsA.DisableDryRun).To(Equal(optsB.DisableDryRun)) + Expect(optsA.AMICacheTTL).To(Equal(optsB.AMICacheTTL)) } diff --git a/pkg/test/environment.go b/pkg/test/environment.go index 36ec27492b21..237c9913dbd6 100644 --- a/pkg/test/environment.go +++ b/pkg/test/environment.go @@ -120,7 +120,7 @@ func NewEnvironment(ctx context.Context, env *coretest.Environment) *Environment iamapi := fake.NewIAMAPI() // cache - amiCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval) + amiCache := cache.New(awscache.AMICacheTTL, awscache.DefaultCleanupInterval) ec2Cache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval) instanceTypeCache := cache.New(awscache.DefaultTTL, 
awscache.DefaultCleanupInterval) instanceCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval) diff --git a/pkg/test/options.go b/pkg/test/options.go index cbb238682626..9212edbdf352 100644 --- a/pkg/test/options.go +++ b/pkg/test/options.go @@ -16,6 +16,7 @@ package test import ( "fmt" + "time" "github.com/imdario/mergo" "github.com/samber/lo" @@ -33,6 +34,7 @@ type OptionsFields struct { InterruptionQueue *string ReservedENIs *int DisableDryRun *bool + AMICacheTTL *time.Duration } func Options(overrides ...OptionsFields) *options.Options { @@ -52,5 +54,6 @@ func Options(overrides ...OptionsFields) *options.Options { InterruptionQueue: lo.FromPtrOr(opts.InterruptionQueue, ""), ReservedENIs: lo.FromPtrOr(opts.ReservedENIs, 0), DisableDryRun: lo.FromPtrOr(opts.DisableDryRun, false), + AMICacheTTL: lo.FromPtrOr(opts.AMICacheTTL, time.Minute), } } diff --git a/website/content/en/preview/reference/settings.md b/website/content/en/preview/reference/settings.md index 980f5ae4641b..c266a0cf601f 100644 --- a/website/content/en/preview/reference/settings.md +++ b/website/content/en/preview/reference/settings.md @@ -12,6 +12,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf | Environment Variable | CLI Flag | Description | |--|--|--| +| AMI_CACHE_TTL | \-\-ami-cache-ttl | TTL for cached AMI discovery results. (default = 1m0s)| | BATCH_IDLE_DURATION | \-\-batch-idle-duration | The maximum amount of time with no new pending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. (default = 1s)| | BATCH_MAX_DURATION | \-\-batch-max-duration | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. 
(default = 10s)| | CLUSTER_CA_BUNDLE | \-\-cluster-ca-bundle | Cluster CA bundle for nodes to use for TLS connections with the API server. If not set, this is taken from the controller's TLS configuration.|