Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions pkg/proxy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package proxy

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// RoutesTotal is a gauge holding the current size of the proxy routing
// table. It is incremented on first insertion of a route ID and
// decremented when an existing route is deleted.
var RoutesTotal = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: "sandboxmanager_routes_total",
	Help: "Total number of routes currently managed",
})

// PeersTotal is a gauge holding the number of peers currently known to
// the peers manager; it is refreshed on every route sync.
var PeersTotal = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: "sandboxmanager_peers_total",
	Help: "Total number of peers currently connected",
})

// init registers the proxy gauges with the controller-runtime metrics
// registry so they are exposed on the manager's /metrics endpoint.
// MustRegister panics on duplicate registration, which surfaces wiring
// mistakes at startup.
func init() {
	metrics.Registry.MustRegister(RoutesTotal)
	metrics.Registry.MustRegister(PeersTotal)
}
69 changes: 69 additions & 0 deletions pkg/proxy/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package proxy

import (
"context"
"testing"

"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
"k8s.io/apimachinery/pkg/types"

"github.com/openkruise/agents/pkg/sandbox-manager/config"
)

// TestRoutesTotal_IncrementOnNewRoute verifies that storing a brand-new
// route bumps the RoutesTotal gauge by exactly one. Deltas are compared
// rather than absolute values because the gauge is package-global state
// shared across tests.
func TestRoutesTotal_IncrementOnNewRoute(t *testing.T) {
	srv := NewServer(nil, nil, config.SandboxManagerOptions{})

	initial := testutil.ToFloat64(RoutesTotal)
	srv.SetRoute(context.Background(), Route{ID: "sb-1", IP: "1.2.3.4", UID: types.UID("uid-1"), ResourceVersion: "1"})
	final := testutil.ToFloat64(RoutesTotal)

	assert.Equal(t, float64(1), final-initial, "RoutesTotal should increment by 1 on new route")
}

// TestRoutesTotal_NoIncrementOnUpdate verifies that re-storing an existing
// route ID with a newer ResourceVersion is treated as an update, leaving
// the RoutesTotal gauge unchanged.
func TestRoutesTotal_NoIncrementOnUpdate(t *testing.T) {
	srv := NewServer(nil, nil, config.SandboxManagerOptions{})
	ctx := context.Background()

	srv.SetRoute(ctx, Route{ID: "sb-2", IP: "1.2.3.4", UID: types.UID("uid-2"), ResourceVersion: "1"})
	baseline := testutil.ToFloat64(RoutesTotal)

	// Same ID, new IP and newer version: an in-place update, not an add.
	srv.SetRoute(ctx, Route{ID: "sb-2", IP: "5.6.7.8", UID: types.UID("uid-2"), ResourceVersion: "2"})

	assert.Equal(t, float64(0), testutil.ToFloat64(RoutesTotal)-baseline, "RoutesTotal should not increment on route update")
}

// TestRoutesTotal_DecrementOnDelete verifies that deleting an existing
// route decrements the RoutesTotal gauge by exactly one.
func TestRoutesTotal_DecrementOnDelete(t *testing.T) {
	srv := NewServer(nil, nil, config.SandboxManagerOptions{})

	srv.SetRoute(context.Background(), Route{ID: "sb-3", IP: "1.2.3.4", UID: types.UID("uid-3"), ResourceVersion: "1"})
	baseline := testutil.ToFloat64(RoutesTotal)

	srv.DeleteRoute("sb-3")

	assert.Equal(t, float64(-1), testutil.ToFloat64(RoutesTotal)-baseline, "RoutesTotal should decrement by 1 on delete")
}

// TestRoutesTotal_NoDecrementOnDeleteNonExistent verifies that deleting an
// unknown route ID is a no-op for the RoutesTotal gauge, i.e. the counter
// cannot drift negative through repeated deletes.
func TestRoutesTotal_NoDecrementOnDeleteNonExistent(t *testing.T) {
	srv := NewServer(nil, nil, config.SandboxManagerOptions{})

	baseline := testutil.ToFloat64(RoutesTotal)
	srv.DeleteRoute("non-existent")

	assert.Equal(t, float64(0), testutil.ToFloat64(RoutesTotal)-baseline, "RoutesTotal should not change when deleting non-existent route")
}

// TestPeersTotal_SetOnSyncRouteWithPeers verifies that SyncRouteWithPeers
// publishes the current peer count to the PeersTotal gauge. With an empty
// mock peer manager the sync must succeed and the gauge must read zero.
//
// Fix: the original discarded the error from SyncRouteWithPeers with
// `_ =`, so the test would keep passing even if the sync itself failed.
func TestPeersTotal_SetOnSyncRouteWithPeers(t *testing.T) {
	mp := newMockPeers()
	s := NewServer(nil, mp, config.SandboxManagerOptions{})

	// No peers registered: sync short-circuits but must still refresh the gauge.
	err := s.SyncRouteWithPeers(Route{ID: "sb-1", IP: "1.2.3.4", ResourceVersion: "1"})
	assert.NoError(t, err)
	assert.Equal(t, float64(0), testutil.ToFloat64(PeersTotal))
}
6 changes: 5 additions & 1 deletion pkg/proxy/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func (s *Server) SetRoute(ctx context.Context, route Route) {
old, loaded := s.routes.LoadOrStore(route.ID, route)
if !loaded {
// First write, success directly
RoutesTotal.Inc()
return
}

Expand Down Expand Up @@ -81,6 +82,7 @@ func (s *Server) SyncRouteWithPeers(route Route) error {
if s.peersManager != nil {
peerList = s.peersManager.GetPeers()
}
PeersTotal.Set(float64(len(peerList)))

if len(peerList) == 0 {
return nil
Expand Down Expand Up @@ -146,7 +148,9 @@ func (s *Server) ListPeers() []peers.Peer {
}

func (s *Server) DeleteRoute(id string) {
s.routes.Delete(id)
if _, loaded := s.routes.LoadAndDelete(id); loaded {
RoutesTotal.Dec()
}
Comment on lines 150 to +153
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now uses LoadAndDelete instead of Delete. sync.Map.LoadAndDelete returns whether the key existed, which is required to conditionally decrement RoutesTotal

}

// RequestAdapter is used to register the mapping from business-side sandbox requests to internal logic
Expand Down
57 changes: 49 additions & 8 deletions pkg/sandbox-manager/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,36 @@ func (m *SandboxManager) ClaimSandbox(ctx context.Context, opts infra.ClaimSandb
if !m.infra.HasTemplate(opts.Template) {
// Requirement: Track failure in API layer
SandboxCreationResponses.WithLabelValues("failure").Inc()
SandboxClaimTotal.WithLabelValues("failure", "").Inc()
return nil, errors.NewError(errors.ErrorNotFound, fmt.Sprintf("template %s not found", opts.Template))
Comment on lines 18 to 22
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SandboxClaimTotal is emitted with an empty lock_type label on this failure path. That creates an extra time series (lock_type="") and loses the lock strategy dimension, which can break dashboards/alerts expecting only {create,update,speculate}. Consider using a sentinel like "unknown" here (or, if available, the best-known lock type).

Copilot uses AI. Check for mistakes.
}
sandbox, metrics, err := m.infra.ClaimSandbox(ctx, opts)
sandbox, claimMetrics, err := m.infra.ClaimSandbox(ctx, opts)
if err != nil {
log.Error(err, "failed to claim sandbox", "metrics", metrics.String())
log.Error(err, "failed to claim sandbox", "metrics", claimMetrics.String())
// Requirement: Track failure in API layer
SandboxCreationResponses.WithLabelValues("failure").Inc()
SandboxClaimTotal.WithLabelValues("failure", "").Inc()
return nil, errors.NewError(errors.ErrorInternal, fmt.Sprintf("failed to claim sandbox: %v", err))
Comment on lines 25 to 30
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SandboxClaimTotal is also emitted with lock_type="" when infra.ClaimSandbox returns an error. Since claimMetrics is available here, prefer labeling the failure with string(claimMetrics.LockType) when it’s set (fallback to "unknown"), so failures can still be segmented by lock strategy.

Copilot uses AI. Check for mistakes.
}

// Success: Record metrics
SandboxCreationResponses.WithLabelValues("success").Inc()
// Requirement: Only measure the latency when no error exists
SandboxCreationLatency.Observe(float64(metrics.Total.Milliseconds()))
SandboxCreationLatency.Observe(float64(claimMetrics.Total.Milliseconds()))

// Claim-specific metrics
SandboxClaimDuration.Observe(float64(claimMetrics.Total.Milliseconds()))
SandboxClaimTotal.WithLabelValues("success", string(claimMetrics.LockType)).Inc()
SandboxClaimRetries.Observe(float64(claimMetrics.Retries))
SandboxClaimStageDuration.WithLabelValues("wait").Observe(float64(claimMetrics.Wait.Milliseconds()))
SandboxClaimStageDuration.WithLabelValues("retry_cost").Observe(float64(claimMetrics.RetryCost.Milliseconds()))
SandboxClaimStageDuration.WithLabelValues("pick_and_lock").Observe(float64(claimMetrics.PickAndLock.Milliseconds()))
SandboxClaimStageDuration.WithLabelValues("wait_ready").Observe(float64(claimMetrics.WaitReady.Milliseconds()))
SandboxClaimStageDuration.WithLabelValues("init_runtime").Observe(float64(claimMetrics.InitRuntime.Milliseconds()))
SandboxClaimStageDuration.WithLabelValues("csi_mount").Observe(float64(claimMetrics.CSIMount.Milliseconds()))

state, reason := sandbox.GetState()
log.Info("sandbox claimed", "sandbox", klog.KObj(sandbox), "metrics", metrics.String(), "state", state, "reason", reason)
log.Info("sandbox claimed", "sandbox", klog.KObj(sandbox), "metrics", claimMetrics.String(), "state", state, "reason", reason)

// Sync route without refresh since sandbox was just claimed and state is already up-to-date
if err = m.syncRoute(ctx, sandbox, false); err != nil {
Expand All @@ -45,19 +58,30 @@ func (m *SandboxManager) ClaimSandbox(ctx context.Context, opts infra.ClaimSandb

func (m *SandboxManager) CloneSandbox(ctx context.Context, opts infra.CloneSandboxOptions) (infra.Sandbox, error) {
log := klog.FromContext(ctx)
sandbox, metrics, err := m.infra.CloneSandbox(ctx, opts)
sandbox, cloneMetrics, err := m.infra.CloneSandbox(ctx, opts)
if err != nil {
log.Error(err, "failed to clone sandbox", "metrics", metrics)
log.Error(err, "failed to clone sandbox", "metrics", cloneMetrics)
SandboxCreationResponses.WithLabelValues("failure").Inc()
SandboxCloneTotal.WithLabelValues("failure").Inc()
return nil, errors.NewError(errors.ErrorInternal, fmt.Sprintf("failed to clone sandbox: %v", err))
}
// Success: Record metrics
SandboxCreationResponses.WithLabelValues("success").Inc()
// Requirement: Only measure the latency when no error exists
SandboxCreationLatency.Observe(float64(metrics.Total.Milliseconds()))
SandboxCreationLatency.Observe(float64(cloneMetrics.Total.Milliseconds()))

// Clone-specific metrics
SandboxCloneDuration.Observe(float64(cloneMetrics.Total.Milliseconds()))
SandboxCloneTotal.WithLabelValues("success").Inc()
SandboxCloneStageDuration.WithLabelValues("wait").Observe(float64(cloneMetrics.Wait.Milliseconds()))
SandboxCloneStageDuration.WithLabelValues("get_template").Observe(float64(cloneMetrics.GetTemplate.Milliseconds()))
SandboxCloneStageDuration.WithLabelValues("create_sandbox").Observe(float64(cloneMetrics.CreateSandbox.Milliseconds()))
SandboxCloneStageDuration.WithLabelValues("wait_ready").Observe(float64(cloneMetrics.WaitReady.Milliseconds()))
SandboxCloneStageDuration.WithLabelValues("init_runtime").Observe(float64(cloneMetrics.InitRuntime.Milliseconds()))
SandboxCloneStageDuration.WithLabelValues("csi_mount").Observe(float64(cloneMetrics.CSIMount.Milliseconds()))

state, reason := sandbox.GetState()
log.Info("sandbox cloned", "sandbox", klog.KObj(sandbox), "metrics", metrics.String(), "state", state, "reason", reason)
log.Info("sandbox cloned", "sandbox", klog.KObj(sandbox), "metrics", cloneMetrics.String(), "state", state, "reason", reason)

// Sync route without refresh since sandbox was just claimed and state is already up-to-date
if err = m.syncRoute(ctx, sandbox, false); err != nil {
Expand Down Expand Up @@ -151,37 +175,54 @@ func (m *SandboxManager) syncRoute(ctx context.Context, sbx infra.Sandbox, refre
route := sbx.GetRoute()
m.proxy.SetRoute(ctx, route)
err := m.proxy.SyncRouteWithPeers(route)
duration := float64(time.Since(start).Milliseconds())
if err != nil {
log.Error(err, "failed to sync route with peers")
RouteSyncTotal.WithLabelValues("sync_with_peers", "failure").Inc()
return err
}
RouteSyncDuration.WithLabelValues("sync_with_peers").Observe(duration)
Comment on lines 179 to +184
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RouteSyncDuration is only observed on the success path. Since duration is already computed before the error check, consider observing the histogram for failures too (and incrementing RouteSyncTotal with status="failure"), otherwise latency visibility is biased toward successful syncs.

Suggested change
if err != nil {
log.Error(err, "failed to sync route with peers")
RouteSyncTotal.WithLabelValues("sync_with_peers", "failure").Inc()
return err
}
RouteSyncDuration.WithLabelValues("sync_with_peers").Observe(duration)
RouteSyncDuration.WithLabelValues("sync_with_peers").Observe(duration)
if err != nil {
log.Error(err, "failed to sync route with peers")
RouteSyncTotal.WithLabelValues("sync_with_peers", "failure").Inc()
return err
}

Copilot uses AI. Check for mistakes.
RouteSyncTotal.WithLabelValues("sync_with_peers", "success").Inc()
RouteSyncDelay.Set(duration)
Comment on lines +184 to +186
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RouteSyncDelay is being set to the sync operation duration, which makes it effectively another latency metric (and overlaps with RouteSyncDuration). If the intent is “delay from route update to peer sync completion”, this should be computed from a route/update timestamp; otherwise consider renaming/adjusting the metric Help text to reflect it’s the last observed sync duration.

Copilot uses AI. Check for mistakes.
log.Info("route synced with peers", "cost", time.Since(start), "route", route)
return nil
}

// PauseSandbox pauses a sandbox and then re-syncs its route with peers.
// A pause failure is counted under the "failure" label and returned; a
// route-sync failure after a successful pause is only logged, so the call
// still records success metrics and returns nil in that case.
func PauseSandboxImpl() {}

func (m *SandboxManager) PauseSandbox(ctx context.Context, sbx infra.Sandbox, opts infra.PauseOptions) error {
	log := klog.FromContext(ctx).WithValues("sandbox", klog.KObj(sbx))
	begin := time.Now()

	if err := sbx.Pause(ctx, opts); err != nil {
		log.Error(err, "failed to pause sandbox")
		SandboxPauseTotal.WithLabelValues("failure").Inc()
		return err
	}

	// Best-effort: route sync errors do not fail the pause.
	if syncErr := m.syncRoute(ctx, sbx, true); syncErr != nil {
		log.Error(syncErr, "failed to sync route with peers after pause")
	}

	elapsedMs := float64(time.Since(begin).Milliseconds())
	SandboxPauseDuration.Observe(elapsedMs)
	SandboxPauseTotal.WithLabelValues("success").Inc()
	observeMax(SandboxPauseMaxDuration, elapsedMs)
	return nil
}

// ResumeSandbox resumes a sandbox and then re-syncs its route with peers.
// A resume failure is counted under the "failure" label and returned; a
// route-sync failure after a successful resume is only logged, so the call
// still records success metrics and returns nil in that case.
func (m *SandboxManager) ResumeSandbox(ctx context.Context, sbx infra.Sandbox) error {
	log := klog.FromContext(ctx).WithValues("sandbox", klog.KObj(sbx))
	begin := time.Now()

	if err := sbx.Resume(ctx); err != nil {
		log.Error(err, "failed to resume sandbox")
		SandboxResumeTotal.WithLabelValues("failure").Inc()
		return err
	}

	// Best-effort: route sync errors do not fail the resume.
	if syncErr := m.syncRoute(ctx, sbx, true); syncErr != nil {
		log.Error(syncErr, "failed to sync route with peers after resume")
	}

	elapsedMs := float64(time.Since(begin).Milliseconds())
	SandboxResumeDuration.Observe(elapsedMs)
	SandboxResumeTotal.WithLabelValues("success").Inc()
	observeMax(SandboxResumeMaxDuration, elapsedMs)
	return nil
}

Expand Down
Loading
Loading