Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions artifacts/deploy/karmada-etcd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,35 @@ spec:
image: registry.k8s.io/etcd:3.6.6-0
imagePullPolicy: IfNotPresent
livenessProbe:
exec:
command:
- /bin/sh
- -ec
- 'etcdctl endpoint health --endpoints https://127.0.0.1:2379 --cacert /etc/karmada/pki/etcd-client/ca.crt --cert /etc/karmada/pki/etcd-client/tls.crt --key /etc/karmada/pki/etcd-client/tls.key'
httpGet:
path: /livez
port: 2381
scheme: HTTP
initialDelaySeconds: 60
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I observed that the value of initialDelaySeconds has been adjusted significantly. What is the reason for this adjustment?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested locally and everything works fine; I'm not sure why we set it to 600s before.

Copy link
Copy Markdown
Author

@vgt-rangehrn vgt-rangehrn May 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I started with the configuration from here and increased the initialDelaySeconds based on the Copilot comment (and my own testing). On the other hand, there would probably be no harm in leaving it at 600.

timeoutSeconds: 5
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
Comment thread
vgt-rangehrn marked this conversation as resolved.
initialDelaySeconds: 600
periodSeconds: 60
readinessProbe:
httpGet:
path: /readyz
port: 2381
scheme: HTTP
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 10
failureThreshold: 30
ports:
- containerPort: 2379
name: client
protocol: TCP
- containerPort: 2380
name: server
protocol: TCP
Comment thread
vgt-rangehrn marked this conversation as resolved.
- containerPort: 2381
name: metrics
protocol: TCP
resources:
requests:
cpu: 500m
Expand All @@ -68,6 +80,8 @@ spec:
- http://0.0.0.0:2380
- --listen-client-urls
- https://0.0.0.0:2379
- --listen-metrics-urls
- http://0.0.0.0:2381
- --advertise-client-urls
- https://etcd-client.karmada-system.svc.cluster.local:2379
- --initial-cluster
Expand Down
30 changes: 22 additions & 8 deletions charts/karmada/templates/etcd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,25 @@ spec:
image: {{ template "karmada.internal.etcd.image" . }}
imagePullPolicy: {{ .Values.etcd.internal.image.pullPolicy }}
livenessProbe:
exec:
command:
- /bin/sh
- -ec
- 'etcdctl endpoint health --endpoints https://127.0.0.1:2379 --cacert /etc/kubernetes/pki/etcd/server-ca.crt --cert /etc/kubernetes/pki/etcd/karmada.crt --key /etc/kubernetes/pki/etcd/karmada.key'
httpGet:
path: /livez
port: 2381
scheme: HTTP
initialDelaySeconds: 60
timeoutSeconds: 5
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
Comment thread
vgt-rangehrn marked this conversation as resolved.
initialDelaySeconds: 600
periodSeconds: 60
readinessProbe:
httpGet:
path: /readyz
port: 2381
scheme: HTTP
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 10
failureThreshold: 30
env:
- name: KARMADA_ETCD_NAME
valueFrom:
Expand All @@ -70,6 +79,9 @@ spec:
- containerPort: 2380
name: server
protocol: TCP
Comment thread
vgt-rangehrn marked this conversation as resolved.
- containerPort: 2381
name: metrics
protocol: TCP
resources:
{{- toYaml .Values.etcd.internal.resources | nindent 12 }}
volumeMounts:
Expand All @@ -86,6 +98,8 @@ spec:
- http://0.0.0.0:2380
- --listen-client-urls
- https://0.0.0.0:2379
- --listen-metrics-urls
- http://0.0.0.0:2381
- --advertise-client-urls
- https://etcd-client.{{ include "karmada.namespace" . }}.svc.{{ .Values.clusterDomain }}:2379
- --initial-cluster
Expand Down
9 changes: 5 additions & 4 deletions operator/pkg/controlplane/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ func installKarmadaEtcd(client clientset.Interface, name, namespace string, cfg
}

etcdStatefulSetBytes, err := util.ParseTemplate(KarmadaEtcdStatefulSet, struct {
KarmadaInstanceName, StatefulSetName, Namespace, Image string
ImagePullPolicy, EtcdClientService, CertsSecretName string
InitialCluster, EtcdDataVolumeName, EtcdCipherSuites string
Replicas, EtcdListenClientPort, EtcdListenPeerPort int32
KarmadaInstanceName, StatefulSetName, Namespace, Image string
ImagePullPolicy, EtcdClientService, CertsSecretName string
InitialCluster, EtcdDataVolumeName, EtcdCipherSuites string
Replicas, EtcdListenClientPort, EtcdListenPeerPort, EtcdMetricsPort int32
}{
Comment on lines 69 to 74
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The anonymous struct used for template parsing is becoming quite large (13 fields). While the repository style guide suggests limiting function parameters to 5, the same principle of readability and maintainability applies to structs. Consider refactoring this into a named struct. Note that per repository rules, the struct name should be plural if it logically represents a collection of multiple items (e.g., WorkloadAffinityGroups).

References
  1. Function parameters should be limited to 5; refactor or encapsulate if exceeded. (link)
  2. A struct name should be plural if it logically represents a collection of multiple items, even if those items are distinct fields within the struct.

Copy link
Copy Markdown
Member

@XiShanYongYe-Chang XiShanYongYe-Chang May 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's okay if this isn't handled in this PR.

KarmadaInstanceName: name,
StatefulSetName: util.KarmadaEtcdName(name),
Expand All @@ -85,6 +85,7 @@ func installKarmadaEtcd(client clientset.Interface, name, namespace string, cfg
Replicas: *cfg.Replicas,
EtcdListenClientPort: constants.EtcdListenClientPort,
EtcdListenPeerPort: constants.EtcdListenPeerPort,
EtcdMetricsPort: constants.EtcdMetricsPort,
})
if err != nil {
return fmt.Errorf("error when parsing Etcd statefuelset template: %w", err)
Expand Down
29 changes: 21 additions & 8 deletions operator/pkg/controlplane/etcd/manifests.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ spec:
- --name=$(KARMADA_ETCD_NAME)
- --listen-client-urls=https://0.0.0.0:{{ .EtcdListenClientPort }}
- --listen-peer-urls=http://0.0.0.0:{{ .EtcdListenPeerPort }}
- --listen-metrics-urls=http://0.0.0.0:{{ .EtcdMetricsPort }}
- --advertise-client-urls=https://{{ .EtcdClientService }}.{{ .Namespace }}.svc.cluster.local:{{ .EtcdListenClientPort }}
Comment thread
vgt-rangehrn marked this conversation as resolved.
Comment on lines 53 to 56
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Binding etcd’s metrics/health listener to 0.0.0.0 over plain HTTP makes /metrics, /livez, and /readyz reachable from any pod that can reach the pod IP, which is a security posture change compared to the previous mTLS-only client port checks. If this endpoint only needs to be consumed by kubelet, consider mitigating exposure (e.g., a NetworkPolicy to restrict access in the namespace, or alternative probe mechanisms that don’t require an unauthenticated HTTP listener).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this shouldn't be an issue, but I'm open to other opinions.

- --initial-cluster={{ .InitialCluster }}
- --initial-cluster-state=new
Expand All @@ -70,23 +71,35 @@ spec:
apiVersion: v1
fieldPath: metadata.name
livenessProbe:
exec:
command:
- /bin/sh
- -ec
- etcdctl endpoint health --endpoints https://127.0.0.1:{{ .EtcdListenClientPort }} --cacert=/etc/karmada/pki/etcd/etcd-ca.crt --cert=/etc/karmada/pki/etcd/etcd-server.crt --key=/etc/karmada/pki/etcd/etcd-server.key
httpGet:
path: /livez
port: {{ .EtcdMetricsPort }}
scheme: HTTP
Comment thread
vgt-rangehrn marked this conversation as resolved.
initialDelaySeconds: 60
timeoutSeconds: 5
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
Comment on lines 73 to 82
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes the etcd liveness probe from a very long initial delay/period (600s/60s) to a much more aggressive cadence (initialDelaySeconds=15, failure after ~30s). That can cause restart loops on slower starts (e.g., disk recovery or large data dirs). Consider keeping the previous timing values, or adding a startupProbe to cover slow initialization while keeping liveness strict for steady state.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my testing, etcd took something like 12s to be ready. I increased the initialDelay to 60s, so the pod would only be restarted if it wasn't ready after 90s. I think that should be fine, but I'm open to input.

initialDelaySeconds: 600
periodSeconds: 60
readinessProbe:
httpGet:
path: /readyz
port: {{ .EtcdMetricsPort }}
scheme: HTTP
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 10
failureThreshold: 30
ports:
- containerPort: {{ .EtcdListenClientPort }}
name: client
protocol: TCP
- containerPort: {{ .EtcdListenPeerPort }}
name: server
protocol: TCP
- containerPort: {{ .EtcdMetricsPort }}
name: metrics
protocol: TCP
volumeMounts:
- mountPath: /var/lib/etcd
name: {{ .EtcdDataVolumeName }}
Expand Down
1 change: 1 addition & 0 deletions pkg/karmadactl/cmdinit/kubernetes/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ func (i *CommandInitOption) defaultEtcdContainerCommand() []string {
fmt.Sprintf("--name=$(%s)", etcdEnvPodName),
fmt.Sprintf("--listen-peer-urls=http://$(%s):%v", etcdEnvPodIP, etcdContainerServerPort),
fmt.Sprintf("--listen-client-urls=https://$(%s):%v,http://127.0.0.1:%v", etcdEnvPodIP, etcdContainerClientPort, etcdContainerClientPort),
fmt.Sprintf("--listen-metrics-urls=http://$(%s):%v", etcdEnvPodIP, etcdContainerMetricsPort),
fmt.Sprintf("--advertise-client-urls=https://$(%s).%s.%s.svc.%s:%v", etcdEnvPodName, etcdStatefulSetAndServiceName, i.Namespace, i.HostClusterDomain, etcdContainerClientPort),
fmt.Sprintf("--initial-cluster=%s", strings.TrimRight(etcdClusterConfig.String(), ",")),
"--initial-cluster-state=new",
Expand Down
53 changes: 30 additions & 23 deletions pkg/karmadactl/cmdinit/kubernetes/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ limitations under the License.
package kubernetes

import (
"fmt"
"strings"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/component-base/cli/flag"
"k8s.io/utils/ptr"
)
Expand All @@ -36,6 +36,8 @@ const (
etcdContainerClientPort = 2379
etcdContainerServerPortName = "server"
etcdContainerServerPort = 2380
etcdContainerMetricsPortName = "metrics"
etcdContainerMetricsPort = 2381
etcdContainerDataVolumeMountName = "etcd-data"
etcdContainerDataVolumeMountPath = "/var/lib/karmada-etcd"
etcdContainerConfigVolumeMountName = "etcd-config"
Expand Down Expand Up @@ -142,34 +144,34 @@ func (i *CommandInitOption) makeETCDStatefulSet() *appsv1.StatefulSet {
}

// Probes
livenesProbe := &corev1.Probe{
livenessProbe := &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{
Command: []string{
"/bin/sh",
"-ec",
fmt.Sprintf("etcdctl endpoint health --endpoints http://127.0.0.1:%v", etcdContainerClientPort),
},
HTTPGet: &corev1.HTTPGetAction{
Path: "/livez",
Port: intstr.FromInt(etcdContainerMetricsPort),
Scheme: corev1.URISchemeHTTP,
},
},
InitialDelaySeconds: 15,
FailureThreshold: 3,
PeriodSeconds: 60,
InitialDelaySeconds: 60,
TimeoutSeconds: 5,
PeriodSeconds: 10,
SuccessThreshold: 1,
FailureThreshold: 3,
}
/* readinesProbe := &corev1.Probe{
Handler: corev1.Handler{
TCPSocket: &corev1.TCPSocketAction{
Port: intstr.IntOrString{
IntVal: etcdContainerClientPort,
},
readinessProbe := &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/readyz",
Port: intstr.FromInt(etcdContainerMetricsPort),
Scheme: corev1.URISchemeHTTP,
},
},
InitialDelaySeconds: 5,
FailureThreshold: 3,
PeriodSeconds: 30,
InitialDelaySeconds: 10,
TimeoutSeconds: 5,
}*/
PeriodSeconds: 5,
SuccessThreshold: 1,
FailureThreshold: 30,
}

// etcd Container
podSpec := corev1.PodSpec{
Expand Down Expand Up @@ -213,6 +215,11 @@ func (i *CommandInitOption) makeETCDStatefulSet() *appsv1.StatefulSet {
ContainerPort: etcdContainerServerPort,
Protocol: corev1.ProtocolTCP,
},
{
Name: etcdContainerMetricsPortName,
ContainerPort: etcdContainerMetricsPort,
Protocol: corev1.ProtocolTCP,
},
},
VolumeMounts: []corev1.VolumeMount{
{
Expand All @@ -231,8 +238,8 @@ func (i *CommandInitOption) makeETCDStatefulSet() *appsv1.StatefulSet {
MountPath: karmadaCertsVolumeMountPath,
},
},
LivenessProbe: livenesProbe,
//ReadinessProbe: readinesProbe,
LivenessProbe: livenessProbe,
ReadinessProbe: readinessProbe,
Env: []corev1.EnvVar{
{
Name: etcdEnvPodName,
Expand Down
Loading