-
Notifications
You must be signed in to change notification settings - Fork 53
Prometheus Metrics for SandboxManager #270
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| package proxy | ||
|
|
||
| import ( | ||
| "github.com/prometheus/client_golang/prometheus" | ||
| "sigs.k8s.io/controller-runtime/pkg/metrics" | ||
| ) | ||
|
|
||
| var ( | ||
| // RoutesTotal tracks the current number of routes in the proxy routing table. | ||
| RoutesTotal = prometheus.NewGauge( | ||
| prometheus.GaugeOpts{ | ||
| Name: "sandboxmanager_routes_total", | ||
| Help: "Total number of routes currently managed", | ||
| }, | ||
| ) | ||
|
|
||
| // PeersTotal tracks the current number of peers connected. | ||
| PeersTotal = prometheus.NewGauge( | ||
| prometheus.GaugeOpts{ | ||
| Name: "sandboxmanager_peers_total", | ||
| Help: "Total number of peers currently connected", | ||
| }, | ||
| ) | ||
| ) | ||
|
|
||
| func init() { | ||
| metrics.Registry.MustRegister(RoutesTotal, PeersTotal) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| package proxy | ||
|
|
||
| import ( | ||
| "context" | ||
| "testing" | ||
|
|
||
| "github.com/prometheus/client_golang/prometheus/testutil" | ||
| "github.com/stretchr/testify/assert" | ||
| "k8s.io/apimachinery/pkg/types" | ||
|
|
||
| "github.com/openkruise/agents/pkg/sandbox-manager/config" | ||
| ) | ||
|
|
||
| func TestRoutesTotal_IncrementOnNewRoute(t *testing.T) { | ||
| s := NewServer(nil, nil, config.SandboxManagerOptions{}) | ||
| ctx := context.Background() | ||
|
|
||
| before := testutil.ToFloat64(RoutesTotal) | ||
| s.SetRoute(ctx, Route{ID: "sb-1", IP: "1.2.3.4", UID: types.UID("uid-1"), ResourceVersion: "1"}) | ||
| after := testutil.ToFloat64(RoutesTotal) | ||
|
|
||
| assert.Equal(t, float64(1), after-before, "RoutesTotal should increment by 1 on new route") | ||
| } | ||
|
|
||
| func TestRoutesTotal_NoIncrementOnUpdate(t *testing.T) { | ||
| s := NewServer(nil, nil, config.SandboxManagerOptions{}) | ||
| ctx := context.Background() | ||
|
|
||
| s.SetRoute(ctx, Route{ID: "sb-2", IP: "1.2.3.4", UID: types.UID("uid-2"), ResourceVersion: "1"}) | ||
| before := testutil.ToFloat64(RoutesTotal) | ||
|
|
||
| // Update same route with newer version | ||
| s.SetRoute(ctx, Route{ID: "sb-2", IP: "5.6.7.8", UID: types.UID("uid-2"), ResourceVersion: "2"}) | ||
| after := testutil.ToFloat64(RoutesTotal) | ||
|
|
||
| assert.Equal(t, float64(0), after-before, "RoutesTotal should not increment on route update") | ||
| } | ||
|
|
||
| func TestRoutesTotal_DecrementOnDelete(t *testing.T) { | ||
| s := NewServer(nil, nil, config.SandboxManagerOptions{}) | ||
| ctx := context.Background() | ||
|
|
||
| s.SetRoute(ctx, Route{ID: "sb-3", IP: "1.2.3.4", UID: types.UID("uid-3"), ResourceVersion: "1"}) | ||
| before := testutil.ToFloat64(RoutesTotal) | ||
|
|
||
| s.DeleteRoute("sb-3") | ||
| after := testutil.ToFloat64(RoutesTotal) | ||
|
|
||
| assert.Equal(t, float64(-1), after-before, "RoutesTotal should decrement by 1 on delete") | ||
| } | ||
|
|
||
| func TestRoutesTotal_NoDecrementOnDeleteNonExistent(t *testing.T) { | ||
| s := NewServer(nil, nil, config.SandboxManagerOptions{}) | ||
|
|
||
| before := testutil.ToFloat64(RoutesTotal) | ||
| s.DeleteRoute("non-existent") | ||
| after := testutil.ToFloat64(RoutesTotal) | ||
|
|
||
| assert.Equal(t, float64(0), after-before, "RoutesTotal should not change when deleting non-existent route") | ||
| } | ||
|
|
||
| func TestPeersTotal_SetOnSyncRouteWithPeers(t *testing.T) { | ||
| mp := newMockPeers() | ||
| s := NewServer(nil, mp, config.SandboxManagerOptions{}) | ||
|
|
||
| // No peers | ||
| _ = s.SyncRouteWithPeers(Route{ID: "sb-1", IP: "1.2.3.4", ResourceVersion: "1"}) | ||
| assert.Equal(t, float64(0), testutil.ToFloat64(PeersTotal)) | ||
| } |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -18,23 +18,36 @@ func (m *SandboxManager) ClaimSandbox(ctx context.Context, opts infra.ClaimSandb | |||||||||||||||||||||||||
| if !m.infra.HasTemplate(opts.Template) { | ||||||||||||||||||||||||||
| // Requirement: Track failure in API layer | ||||||||||||||||||||||||||
| SandboxCreationResponses.WithLabelValues("failure").Inc() | ||||||||||||||||||||||||||
| SandboxClaimTotal.WithLabelValues("failure", "").Inc() | ||||||||||||||||||||||||||
| return nil, errors.NewError(errors.ErrorNotFound, fmt.Sprintf("template %s not found", opts.Template)) | ||||||||||||||||||||||||||
|
Comment on lines
18
to
22
|
||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| sandbox, metrics, err := m.infra.ClaimSandbox(ctx, opts) | ||||||||||||||||||||||||||
| sandbox, claimMetrics, err := m.infra.ClaimSandbox(ctx, opts) | ||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||
| log.Error(err, "failed to claim sandbox", "metrics", metrics.String()) | ||||||||||||||||||||||||||
| log.Error(err, "failed to claim sandbox", "metrics", claimMetrics.String()) | ||||||||||||||||||||||||||
| // Requirement: Track failure in API layer | ||||||||||||||||||||||||||
| SandboxCreationResponses.WithLabelValues("failure").Inc() | ||||||||||||||||||||||||||
| SandboxClaimTotal.WithLabelValues("failure", "").Inc() | ||||||||||||||||||||||||||
| return nil, errors.NewError(errors.ErrorInternal, fmt.Sprintf("failed to claim sandbox: %v", err)) | ||||||||||||||||||||||||||
|
Comment on lines
25
to
30
|
||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Success: Record metrics | ||||||||||||||||||||||||||
| SandboxCreationResponses.WithLabelValues("success").Inc() | ||||||||||||||||||||||||||
| // Requirement: Only measure the latency when no error exists | ||||||||||||||||||||||||||
| SandboxCreationLatency.Observe(float64(metrics.Total.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCreationLatency.Observe(float64(claimMetrics.Total.Milliseconds())) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Claim-specific metrics | ||||||||||||||||||||||||||
| SandboxClaimDuration.Observe(float64(claimMetrics.Total.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxClaimTotal.WithLabelValues("success", string(claimMetrics.LockType)).Inc() | ||||||||||||||||||||||||||
| SandboxClaimRetries.Observe(float64(claimMetrics.Retries)) | ||||||||||||||||||||||||||
| SandboxClaimStageDuration.WithLabelValues("wait").Observe(float64(claimMetrics.Wait.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxClaimStageDuration.WithLabelValues("retry_cost").Observe(float64(claimMetrics.RetryCost.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxClaimStageDuration.WithLabelValues("pick_and_lock").Observe(float64(claimMetrics.PickAndLock.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxClaimStageDuration.WithLabelValues("wait_ready").Observe(float64(claimMetrics.WaitReady.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxClaimStageDuration.WithLabelValues("init_runtime").Observe(float64(claimMetrics.InitRuntime.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxClaimStageDuration.WithLabelValues("csi_mount").Observe(float64(claimMetrics.CSIMount.Milliseconds())) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| state, reason := sandbox.GetState() | ||||||||||||||||||||||||||
| log.Info("sandbox claimed", "sandbox", klog.KObj(sandbox), "metrics", metrics.String(), "state", state, "reason", reason) | ||||||||||||||||||||||||||
| log.Info("sandbox claimed", "sandbox", klog.KObj(sandbox), "metrics", claimMetrics.String(), "state", state, "reason", reason) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Sync route without refresh since sandbox was just claimed and state is already up-to-date | ||||||||||||||||||||||||||
| if err = m.syncRoute(ctx, sandbox, false); err != nil { | ||||||||||||||||||||||||||
|
|
@@ -45,19 +58,30 @@ func (m *SandboxManager) ClaimSandbox(ctx context.Context, opts infra.ClaimSandb | |||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| func (m *SandboxManager) CloneSandbox(ctx context.Context, opts infra.CloneSandboxOptions) (infra.Sandbox, error) { | ||||||||||||||||||||||||||
| log := klog.FromContext(ctx) | ||||||||||||||||||||||||||
| sandbox, metrics, err := m.infra.CloneSandbox(ctx, opts) | ||||||||||||||||||||||||||
| sandbox, cloneMetrics, err := m.infra.CloneSandbox(ctx, opts) | ||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||
| log.Error(err, "failed to clone sandbox", "metrics", metrics) | ||||||||||||||||||||||||||
| log.Error(err, "failed to clone sandbox", "metrics", cloneMetrics) | ||||||||||||||||||||||||||
| SandboxCreationResponses.WithLabelValues("failure").Inc() | ||||||||||||||||||||||||||
| SandboxCloneTotal.WithLabelValues("failure").Inc() | ||||||||||||||||||||||||||
| return nil, errors.NewError(errors.ErrorInternal, fmt.Sprintf("failed to clone sandbox: %v", err)) | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| // Success: Record metrics | ||||||||||||||||||||||||||
| SandboxCreationResponses.WithLabelValues("success").Inc() | ||||||||||||||||||||||||||
| // Requirement: Only measure the latency when no error exists | ||||||||||||||||||||||||||
| SandboxCreationLatency.Observe(float64(metrics.Total.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCreationLatency.Observe(float64(cloneMetrics.Total.Milliseconds())) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Clone-specific metrics | ||||||||||||||||||||||||||
| SandboxCloneDuration.Observe(float64(cloneMetrics.Total.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCloneTotal.WithLabelValues("success").Inc() | ||||||||||||||||||||||||||
| SandboxCloneStageDuration.WithLabelValues("wait").Observe(float64(cloneMetrics.Wait.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCloneStageDuration.WithLabelValues("get_template").Observe(float64(cloneMetrics.GetTemplate.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCloneStageDuration.WithLabelValues("create_sandbox").Observe(float64(cloneMetrics.CreateSandbox.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCloneStageDuration.WithLabelValues("wait_ready").Observe(float64(cloneMetrics.WaitReady.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCloneStageDuration.WithLabelValues("init_runtime").Observe(float64(cloneMetrics.InitRuntime.Milliseconds())) | ||||||||||||||||||||||||||
| SandboxCloneStageDuration.WithLabelValues("csi_mount").Observe(float64(cloneMetrics.CSIMount.Milliseconds())) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| state, reason := sandbox.GetState() | ||||||||||||||||||||||||||
| log.Info("sandbox cloned", "sandbox", klog.KObj(sandbox), "metrics", metrics.String(), "state", state, "reason", reason) | ||||||||||||||||||||||||||
| log.Info("sandbox cloned", "sandbox", klog.KObj(sandbox), "metrics", cloneMetrics.String(), "state", state, "reason", reason) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Sync route without refresh since sandbox was just claimed and state is already up-to-date | ||||||||||||||||||||||||||
| if err = m.syncRoute(ctx, sandbox, false); err != nil { | ||||||||||||||||||||||||||
|
|
@@ -151,37 +175,54 @@ func (m *SandboxManager) syncRoute(ctx context.Context, sbx infra.Sandbox, refre | |||||||||||||||||||||||||
| route := sbx.GetRoute() | ||||||||||||||||||||||||||
| m.proxy.SetRoute(ctx, route) | ||||||||||||||||||||||||||
| err := m.proxy.SyncRouteWithPeers(route) | ||||||||||||||||||||||||||
| duration := float64(time.Since(start).Milliseconds()) | ||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||
| log.Error(err, "failed to sync route with peers") | ||||||||||||||||||||||||||
| RouteSyncTotal.WithLabelValues("sync_with_peers", "failure").Inc() | ||||||||||||||||||||||||||
| return err | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| RouteSyncDuration.WithLabelValues("sync_with_peers").Observe(duration) | ||||||||||||||||||||||||||
|
Comment on lines
179
to
+184
|
||||||||||||||||||||||||||
| if err != nil { | |
| log.Error(err, "failed to sync route with peers") | |
| RouteSyncTotal.WithLabelValues("sync_with_peers", "failure").Inc() | |
| return err | |
| } | |
| RouteSyncDuration.WithLabelValues("sync_with_peers").Observe(duration) | |
| RouteSyncDuration.WithLabelValues("sync_with_peers").Observe(duration) | |
| if err != nil { | |
| log.Error(err, "failed to sync route with peers") | |
| RouteSyncTotal.WithLabelValues("sync_with_peers", "failure").Inc() | |
| return err | |
| } |
Copilot
AI
Apr 22, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RouteSyncDelay is being set to the sync operation duration, which makes it effectively another latency metric (and overlaps with RouteSyncDuration). If the intent is “delay from route update to peer sync completion”, this should be computed from a route/update timestamp; otherwise consider renaming/adjusting the metric Help text to reflect it’s the last observed sync duration.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now uses
LoadAndDeleteinstead of Delete.sync.Map.LoadAndDeletereturns whether the key existed, which is required to conditionally decrementRoutesTotal