Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ To learn more about active deprecations, we recommend checking [GitHub Discussio
- **General**: Check updated status for Fallback condition instead of ScaledObject ([#7488](https://github.com/kedacore/keda/issues/7488))
- **General**: Fix int64 overflow in milli-quantity conversion for very large metric values ([#7441](https://github.com/kedacore/keda/issues/7441))
- **General**: Fix ScaledObject admission webhook to return validation error from `verifyReplicaCount`, preventing invalid ScaledObjects from being created ([#5954](https://github.com/kedacore/keda/issues/5954))
- **General**: Guard `GetCurrentReplicas` against nil `Status.ScaleTargetGVKR` to prevent operator panics under the cache race documented in ([#4389](https://github.com/kedacore/keda/issues/4389)) / ([#4955](https://github.com/kedacore/keda/issues/4955))
- **Azure Data Explorer Scaler**: Remove clientSecretFromEnv support ([#7554](https://github.com/kedacore/keda/pull/7554))
- **Cron Scaler**: Fix metric name generation so cron expressions with comma-separated values no longer produce invalid metric names ([#7448](https://github.com/kedacore/keda/issues/7448))
- **Forgejo Scaler**: Limit HTTP error response logging ([#7469](https://github.com/kedacore/keda/pull/7469))
Expand Down
46 changes: 34 additions & 12 deletions pkg/scaling/resolver/scale_resolvers.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,28 @@ func isSecretAccessRestricted(logger logr.Logger) bool {
return boolFalse
}

// ensureScaleTargetGVKR returns scaledObject unchanged if its Status.ScaleTargetGVKR is
// already populated. If it is nil (informer-cache race, see issues #4389 / #4955) the
// function re-fetches the object from the API server and returns the fresh copy. If the
// GVKR is still nil after re-fetch, an error is returned so callers never dereference a
// nil pointer.
func ensureScaleTargetGVKR(ctx context.Context, kubeClient client.Client, scaledObject *kedav1alpha1.ScaledObject) (*kedav1alpha1.ScaledObject, error) {
if scaledObject.Status.ScaleTargetGVKR != nil {
return scaledObject, nil
}
fresh := &kedav1alpha1.ScaledObject{}
if err := kubeClient.Get(ctx, types.NamespacedName{Name: scaledObject.Name, Namespace: scaledObject.Namespace}, fresh); err != nil {
log.Error(err, "failed to get ScaledObject", "name", scaledObject.Name, "namespace", scaledObject.Namespace)
return nil, err
}
if fresh.Status.ScaleTargetGVKR == nil {
err := fmt.Errorf("failed to get ScaledObject.Status.ScaleTargetGVKR, probably invalid ScaledObject cache")
log.Error(err, "ScaleTargetGVKR still nil after re-fetch", "scaledObject.Name", fresh.Name, "scaledObject.Namespace", fresh.Namespace)
return nil, err
}
return fresh, nil
}

// ResolveScaleTargetPodSpec for given scalableObject inspects the scale target workload,
// which could be almost any k8s resource (Deployment, StatefulSet, CustomResource...)
// and for the given resource returns *corev1.PodTemplateSpec and a name of the container
Expand All @@ -103,18 +125,9 @@ func ResolveScaleTargetPodSpec(ctx context.Context, kubeClient client.Client, sc
// trying to prevent operator crashes, due to some race condition, sometimes obj.Status.ScaleTargetGVKR is nil
// see https://github.com/kedacore/keda/issues/4389
// Tracking issue: https://github.com/kedacore/keda/issues/4955
if obj.Status.ScaleTargetGVKR == nil {
scaledObject := &kedav1alpha1.ScaledObject{}
err := kubeClient.Get(ctx, types.NamespacedName{Name: obj.Name, Namespace: obj.Namespace}, scaledObject)
if err != nil {
log.Error(err, "failed to get ScaledObject", "name", obj.Name, "namespace", obj.Namespace)
return nil, "", err
}
obj = scaledObject
}
if obj.Status.ScaleTargetGVKR == nil {
err := fmt.Errorf("failed to get ScaledObject.Status.ScaleTargetGVKR, probably invalid ScaledObject cache")
log.Error(err, "failed to get ScaledObject.Status.ScaleTargetGVKR, probably invalid ScaledObject cache", "scaledObject.Name", obj.Name, "scaledObject.Namespace", obj.Namespace)
var err error
obj, err = ensureScaleTargetGVKR(ctx, kubeClient, obj)
if err != nil {
return nil, "", err
}

Expand Down Expand Up @@ -724,6 +737,15 @@ func resolveServiceAccountAnnotation(ctx context.Context, client client.Client,

// GetCurrentReplicas returns the current replica count for a ScaledObject
func GetCurrentReplicas(ctx context.Context, client client.Client, scaleClient scale.ScalesGetter, scaledObject *kedav1alpha1.ScaledObject) (int32, error) {
// trying to prevent operator crashes, due to some race condition, sometimes scaledObject.Status.ScaleTargetGVKR is nil
// see https://github.com/kedacore/keda/issues/4389
// Tracking issue: https://github.com/kedacore/keda/issues/4955
var err error
scaledObject, err = ensureScaleTargetGVKR(ctx, client, scaledObject)
if err != nil {
return 0, err
}

targetName := scaledObject.Spec.ScaleTargetRef.Name
targetGVKR := scaledObject.Status.ScaleTargetGVKR

Expand Down
97 changes: 97 additions & 0 deletions pkg/scaling/resolver/scale_resolvers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/stretchr/testify/assert"
"go.uber.org/mock/gomock"
appsv1 "k8s.io/api/apps/v1"
authv1 "k8s.io/api/authentication/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -1124,3 +1125,99 @@ func TestReadAuthParamsFromFile_Errors(t *testing.T) {
assert.Error(t, err)
assert.Contains(t, err.Error(), "filePath must be relative")
}

// TestGetCurrentReplicas_NilScaleTargetGVKR verifies that GetCurrentReplicas
// does not panic when scaledObject.Status.ScaleTargetGVKR is nil, which can
// happen due to the cache race documented at
// https://github.com/kedacore/keda/issues/4389 (tracking #4955).
func TestGetCurrentReplicas_NilScaleTargetGVKR(t *testing.T) {
if err := appsv1.AddToScheme(scheme.Scheme); err != nil {
t.Fatalf("failed to add appsv1 to scheme: %v", err)
}
if err := kedav1alpha1.AddToScheme(scheme.Scheme); err != nil {
t.Fatalf("failed to add kedav1alpha1 to scheme: %v", err)
}

const soName = "so-nil-gvkr"
const soNamespace = "test-ns"
const deploymentName = "target-dep"
replicas := int32(3)

populatedGVKR := &kedav1alpha1.GroupVersionKindResource{
Group: "apps",
Version: "v1",
Kind: "Deployment",
}

tests := []struct {
name string
existing []runtime.Object
wantReplicas int32
wantErr bool
wantErrMsg string
}{
{
name: "nil GVKR on input, refetch returns populated GVKR (Deployment path)",
existing: []runtime.Object{
&kedav1alpha1.ScaledObject{
ObjectMeta: metav1.ObjectMeta{Name: soName, Namespace: soNamespace},
Spec: kedav1alpha1.ScaledObjectSpec{
ScaleTargetRef: &kedav1alpha1.ScaleTarget{Name: deploymentName},
},
Status: kedav1alpha1.ScaledObjectStatus{ScaleTargetGVKR: populatedGVKR},
},
&appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{Name: deploymentName, Namespace: soNamespace},
Spec: appsv1.DeploymentSpec{Replicas: &replicas},
},
},
wantReplicas: replicas,
},
{
name: "nil GVKR on input, refetch also has nil GVKR",
existing: []runtime.Object{
&kedav1alpha1.ScaledObject{
ObjectMeta: metav1.ObjectMeta{Name: soName, Namespace: soNamespace},
Spec: kedav1alpha1.ScaledObjectSpec{
ScaleTargetRef: &kedav1alpha1.ScaleTarget{Name: deploymentName},
},
},
},
wantErr: true,
wantErrMsg: "probably invalid ScaledObject cache",
},
{
name: "nil GVKR on input, refetch fails (ScaledObject not in cache)",
existing: []runtime.Object{},
wantErr: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
kubeClient := fake.NewClientBuilder().
WithScheme(scheme.Scheme).
WithRuntimeObjects(tt.existing...).
Build()

input := &kedav1alpha1.ScaledObject{
ObjectMeta: metav1.ObjectMeta{Name: soName, Namespace: soNamespace},
Spec: kedav1alpha1.ScaledObjectSpec{
ScaleTargetRef: &kedav1alpha1.ScaleTarget{Name: deploymentName},
},
}

got, err := GetCurrentReplicas(context.Background(), kubeClient, nil, input)

if tt.wantErr {
assert.Error(t, err, "expected error")
if tt.wantErrMsg != "" && err != nil {
assert.Contains(t, err.Error(), tt.wantErrMsg)
}
return
}
assert.NoError(t, err)
assert.Equal(t, tt.wantReplicas, got)
})
}
}
Loading