diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000..1a4fc8dc2c --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1 @@ +Use all `agents.md` files found from the repository root to the current directory as instructions and context, applying them in root-to-leaf order; if instructions conflict, the `agents.md` closest to the current directory takes precedence. Use relevant repo skills from `.github/skills/` when applicable. diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml index fe66883dfe..8bd5eb8bca 100644 --- a/.github/dependabot.yaml +++ b/.github/dependabot.yaml @@ -9,6 +9,8 @@ updates: prefix: "ci" labels: ["ci", "dependencies"] open-pull-requests-limit: 10 + cooldown: + default-days: 7 - package-ecosystem: "github-actions" directory: "/" @@ -18,6 +20,8 @@ updates: prefix: "ci" labels: ["ci", "dependencies"] open-pull-requests-limit: 10 + cooldown: + default-days: 7 # ========================= # DEFAULT BRANCH (master/main) — STAGED ROLLOUT @@ -52,6 +56,8 @@ updates: versions: [">=0.37.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.37.0"] + cooldown: + default-days: 7 # /azure-ipam: grouped minor+patch + allow majors concurrently - package-ecosystem: "gomod" @@ -82,6 +88,8 @@ updates: versions: [">=0.37.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.37.0"] + cooldown: + default-days: 7 # /dropgz: grouped minor+patch + allow majors concurrently - package-ecosystem: "gomod" @@ -99,6 +107,8 @@ updates: applies-to: version-updates patterns: ["*"] update-types: ["minor", "patch"] + cooldown: + default-days: 7 # /zapai: grouped weekly minor+patch + allow majors concurrently - package-ecosystem: "gomod" @@ -116,6 +126,8 @@ updates: applies-to: version-updates patterns: ["*"] update-types: ["minor", "patch"] + cooldown: + default-days: 7 # ========================= # RELEASE BRANCHES — keep daily for v1.5, v1.6, and v1.7 @@ -149,6 +161,8 @@ updates: versions: 
[">=0.35.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.35.0"] + cooldown: + default-days: 7 - package-ecosystem: "gomod" directory: "/azure-ipam" @@ -177,6 +191,8 @@ updates: versions: [">=0.35.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.35.0"] + cooldown: + default-days: 7 # release/v1.6 constraints (daily); - package-ecosystem: "gomod" @@ -206,6 +222,8 @@ updates: versions: [">=0.32.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.32.0"] + cooldown: + default-days: 7 - package-ecosystem: "gomod" directory: "/azure-ipam" @@ -234,6 +252,8 @@ updates: versions: [">=0.32.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.32.0"] + cooldown: + default-days: 7 # release/v1.5 constraints (daily) - package-ecosystem: "gomod" @@ -263,6 +283,8 @@ updates: versions: [">=0.30.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.30.0"] + cooldown: + default-days: 7 - package-ecosystem: "gomod" directory: "/azure-ipam" @@ -291,3 +313,5 @@ updates: versions: [">=0.30.0"] - dependency-name: "k8s.io/kubectl" versions: [">=0.30.0"] + cooldown: + default-days: 7 diff --git a/.github/skills/acn-go-context-lifecycle/SKILL.md b/.github/skills/acn-go-context-lifecycle/SKILL.md new file mode 100644 index 0000000000..dba80ae5ad --- /dev/null +++ b/.github/skills/acn-go-context-lifecycle/SKILL.md @@ -0,0 +1,332 @@ +--- +name: acn-go-context-lifecycle +description: "Go context propagation, goroutine lifecycle, and process shutdown discipline. Use when writing or reviewing code that spawns goroutines, manages process lifecycle, passes context through call chains, or handles graceful shutdown. Trigger on context.Background() outside of main/tests, fire-and-forget goroutines, custom exit channels where context would suffice, or missing context in function signatures. Also trigger when reviewing init/startup code that creates its own contexts instead of accepting one." 
+user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang. +metadata: + author: rbtr + version: "1.0.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go lifecycle engineer. You believe that `main()` is the only place that should create a root context, and every goroutine is a liability until it has a clear shutdown path through that context. Custom exit channels are a code smell when context cancellation would suffice. Long-lived select loops must always observe `ctx.Done()`, and `time.After` inside a loop is usually a timer leak in disguise. + +**Modes:** + +- **Write mode** — designing process startup, goroutine management, shutdown, and long-lived loops. Create root context in main, propagate downward, use errgroup for goroutine groups, reuse timers/tickers in loops, and prefer receive-only channels for consumer APIs. +- **Review mode** — reviewing code for lifecycle violations. Flag `context.Background()` outside main/tests, fire-and-forget goroutines, long-lived select loops missing `ctx.Done()`, `time.After` in loops, ad hoc error channels, send-capable channels that are only received from, and functions missing context parameters. +- **Audit mode** — auditing goroutine lifecycle across a codebase. Launch up to 3 parallel sub-agents: (1) find `context.Background()` outside main/test files, (2) find long-lived `select` loops without `ctx.Done()` or loops using `time.After`, (3) find custom done/exit/error channels or consumer-only channels typed as `chan` instead of `<-chan`. + +> **Complements** `samber/cc-skills-golang@golang-concurrency` (which covers channels, mutexes, and sync primitives) and `samber/cc-skills-golang@golang-context` (which covers context mechanics). This skill focuses on **who owns the context, how it flows through the process, and goroutine lifecycle discipline**. 
+ +# Go Context & Lifecycle Discipline + +Context is the process control plane. Root cancellation flows from `main()` down through every goroutine, controller, and I/O operation. When the context is done, everything stops. + +## Core Principles + +1. **`main()` owns the root context** — it creates `context.WithCancel(context.Background())`, wires up signal handling, and passes the context down. No other code creates root contexts. +2. **Helper/init functions receive context** — they never call `context.Background()`. If they need a context, the caller provides one. +3. **Context replaces custom exit channels** — if you have a `done chan struct{}` or `exitCh chan error` that exists solely to propagate shutdown, replace it with context cancellation. +4. **Every goroutine has a shutdown path** — through the context it was given. No fire-and-forget goroutines in production code. +5. **Every long-lived select loop includes `ctx.Done()`** — if a goroutine blocks on events, queues, or timers, cancellation must be one of the `select` cases. +6. **errgroup.WithContext for goroutine groups** — first error cancels the group context, all goroutines get the signal, `Wait()` blocks until all exit. +7. **Avoid `time.After` inside loops** — prefer `time.NewTicker` or a reusable `time.Timer`/`Reset` pattern so repeated waits do not accumulate throwaway timers. +8. **Channel direction encodes ownership** — if a struct or API only receives from a channel, type it as `<-chan T`. 
+ +## `main()` Owns Cancellation + +```go +func main() { + rootCtx, rootCancel := context.WithCancel(context.Background()) + defer rootCancel() + + // Wire up signal handling + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM) + go func() { + <-sigCh + rootCancel() // everything shuts down from here + }() + + // Pass rootCtx to all subsystems + if err := initCRDState(rootCtx, config); err != nil { + log.Fatal(err) + } + if err := startServer(rootCtx, config); err != nil { + log.Fatal(err) + } +} +``` + +## Never `context.Background()` Outside Main + +```go +// ❌ BAD — helper creates its own context, disconnected from process lifecycle +func initializeState(config Config) error { + ctx := context.Background() // who cancels this? nobody! + return reconcileState(ctx, config) +} + +// ✅ GOOD — receives context from caller +func initializeState(ctx context.Context, config Config) error { + return reconcileState(ctx, config) +} +``` + +`context.Background()` in a helper function means that operation cannot be cancelled when the process is shutting down. It will block until completion even after SIGTERM. + +**Exceptions:** `context.Background()` is acceptable in: +- `main()` — creating the root context +- Tests — `context.Background()` or `t.Context()` (Go 1.24+) +- `init()` — very rare, for truly fire-once registration + +## Replace Custom Channels with Context + +```go +// ❌ BAD — custom shutdown channel duplicates context's job +type Server struct { + shutdownCh chan struct{} +} + +func (s *Server) Run(ctx context.Context) error { + s.shutdownCh = make(chan struct{}) + go func() { + <-ctx.Done() + close(s.shutdownCh) // why not just use ctx.Done()? + }() + // ... 
+ select { + case <-s.shutdownCh: + return nil + } +} + +// ✅ GOOD — use the context directly +func (s *Server) Run(ctx context.Context) error { + childCtx, cancel := context.WithCancel(ctx) + defer cancel() + + grpcServer := grpc.NewServer() + go func() { + <-childCtx.Done() + grpcServer.GracefulStop() + }() + return grpcServer.Serve(listener) +} +``` + +## Long-Lived Select Loops Need `ctx.Done()` + +If a goroutine lives in a `for { select { ... } }` loop waiting on pod events, netlink updates, resync ticks, or queue drains, `ctx.Done()` must be one of the exits. + +```go +// ❌ BAD — loop never notices process shutdown +func (m *iptablesMonitor) run(ctx context.Context, netlinkUpdates <-chan netlink.RouteUpdate) error { + for { + select { + case update := <-netlinkUpdates: + if err := m.reconcile(update); err != nil { + return err + } + } + } +} + +// ✅ GOOD — cancellation is always a way out +func (m *iptablesMonitor) run(ctx context.Context, netlinkUpdates <-chan netlink.RouteUpdate) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + case update, ok := <-netlinkUpdates: + if !ok { + return nil + } + if err := m.reconcile(update); err != nil { + return err + } + } + } +} +``` + +## Prefer errgroup over Ad Hoc Error Channels + +When multiple goroutines form a unit of work, prefer `errgroup.WithContext`. It couples error propagation with cancellation, which is exactly what ACN controller/watcher lifecycles need. + +```go +func (c *Controller) Start(ctx context.Context) error { + g, groupCtx := errgroup.WithContext(ctx) + g.Go(func() error { return c.watchNodeNetworkConfigs(groupCtx) }) + g.Go(func() error { return c.watchNetworkContainers(groupCtx) }) + // First error cancels groupCtx → sibling goroutines stop too. 
+ return g.Wait() +} +``` + +Compare this to the manual approach it replaces: + +```go +// ❌ BAD — ad hoc error channel returns first error but leaves siblings running +errs := make(chan error, 2) +go func() { errs <- c.watchNodeNetworkConfigs(ctx) }() +go func() { errs <- c.watchNetworkContainers(ctx) }() +return <-errs +``` + +Use error channels for streaming observations only when the goroutines are not a single lifecycle unit. If they start, stop, and fail together, `errgroup` should own them. + +## Avoid `time.After` in Repeating Loops + +`time.After` is fine for a one-off wait. Inside a long-lived loop it allocates a new timer on every pass — prior to Go 1.23 those timers lingered until they fired, and although newer Go versions garbage-collect unstopped timers, the per-iteration allocation is still wasted work. In ACN daemons and controllers, prefer reusable tickers/timers. + +```go +// ❌ BAD — new timer every iteration +func (m *iptablesManager) Run(ctx context.Context) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(30 * time.Second): + if err := m.syncIPSets(ctx); err != nil { + return err + } + } + } +} + +// ✅ GOOD — reusable ticker +func (m *iptablesManager) Run(ctx context.Context) error { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + if err := m.syncIPSets(ctx); err != nil { + return err + } + } + } +} +``` + +For variable backoff or one-shot delays inside a loop, keep a reusable `time.Timer` and `Reset` it instead of calling `time.After` each time. + +## Prefer Receive-Only Channels to Encode Ownership + +If a struct or function only consumes a channel, type it as `<-chan T`. That makes ownership obvious: this code can receive, but it cannot send or close the producer's stream.
+ +```go +// ❌ BAD — consumer can send on or close the channel by mistake +type podUpdateConsumer struct { + podUpdates chan podUpdate +} + +func newPodUpdateConsumer(podUpdates chan podUpdate) *podUpdateConsumer { + return &podUpdateConsumer{podUpdates: podUpdates} +} + +// ✅ GOOD — producer owns the send side +type podUpdateConsumer struct { + podUpdates <-chan podUpdate +} + +func newPodUpdateConsumer(podUpdates <-chan podUpdate) *podUpdateConsumer { + return &podUpdateConsumer{podUpdates: podUpdates} +} +``` + +## Context in Retry Loops + +Retry logic must respect context cancellation between attempts: + +```go +// ❌ BAD — retry ignores cancellation +for { + if err := doWork(); err != nil { + time.Sleep(time.Minute) // sleeps through shutdown! + continue + } + break +} + +// ✅ GOOD — retry respects context +_ = retry.Do(func() error { + return doWork() +}, retry.Context(ctx), retry.BackOffDelay, retry.UntilSucceeded()) +``` + +## Init Functions Should Not Embed Retry + +Push retry responsibility up to the caller or leverage existing retry machinery (like controller-runtime's reconcile loop): + +```go +// ❌ BAD — init function retries internally, hard to test, hard to cancel +func reconcileInitialState(ctx context.Context) error { + attempt := 0 + return retry.Do(func() error { + attempt++ + return doInit(ctx) + }, retry.Attempts(10), retry.Delay(time.Minute)) +} + +// ✅ GOOD — init is a plain function, retry is the caller's concern +// (or leveraged from the reconciler's built-in retry) +func reconcileInitialState(nnc *v1alpha.NodeNetworkConfig) error { + // pure function, no retry, no context needed for retry + return doInit(nnc) +} +// Caller wraps it as a reconciler initializer — gets free retries from ctrlruntime +``` + +## Signal Handling Setup + +```go +// ✅ Idiomatic signal handling in main +func main() { + rootCtx, rootCancel := context.WithCancel(context.Background()) + defer rootCancel() + + // init() should already have set up flags, logging, etc. 
+ // Signal handling should be one of the first things in main + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM) + go func() { + sig := <-sigCh + log.Printf("received signal %s, shutting down", sig) + rootCancel() + }() + + // Now start everything with rootCtx... +} +``` + +Signal handling should be set up early in `main()`. If it's set up halfway through initialization and something hangs before that point, you have no way to signal the process to stop. + +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| `context.Background()` in helper functions | Accept `context.Context` as first parameter | +| Custom `done chan struct{}` for shutdown | Use `ctx.Done()` directly | +| Long-lived `select` loop with no `ctx.Done()` | Add a cancellation case and exit the loop on shutdown | +| `go func() { ... }()` with no shutdown path | Pass context, include `ctx.Done()` in blocking loops, or use errgroup/WaitGroup | +| `time.After` inside a `for/select` loop | Use `time.NewTicker` or a reusable `time.Timer` | +| Retry loop with `time.Sleep` | Use `retry.Do` with `retry.Context(ctx)` | +| Init function with embedded retry | Make init a pure function, let caller/reconciler handle retry | +| Signal handling deep in initialization | Set up signals first thing in `main()` | +| Ad hoc `errs := make(chan error)` for goroutine group failures | Use `errgroup.WithContext` | +| `chan T` field/param that only receives | Narrow it to `<-chan T` | +| Storing `context.Context` in a struct field | Pass context through function parameters | + +## Cross-References + +- → See `samber/cc-skills-golang@golang-concurrency` for channel patterns, sync primitives, and worker pools +- → See `samber/cc-skills-golang@golang-context` for context mechanics, timeouts, and value propagation +- → See `acn-go-design-boundaries` skill for errgroup patterns and async metrics +- → See `acn-go-errors-logging` skill for retry library usage diff --git 
a/.github/skills/acn-go-control-plane-contracts/SKILL.md b/.github/skills/acn-go-control-plane-contracts/SKILL.md new file mode 100644 index 0000000000..19b71f5287 --- /dev/null +++ b/.github/skills/acn-go-control-plane-contracts/SKILL.md @@ -0,0 +1,243 @@ +--- +name: acn-go-control-plane-contracts +description: "Go control-plane contract design for Azure/azure-container-networking. Use when changing CRD Go types, status/condition fields, response-code mapping, controller/reconciler bootstrap logic, or generated contract artifacts such as CRD manifests and rendered Dockerfiles. Trigger on edits under crd/, API type packages, response/status code enums, reconciler init or migration code, or any change that affects externally consumed state." +user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang in Kubernetes/controller-style systems. +metadata: + author: rbtr + version: "1.0.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go control-plane engineer. You treat `status`, `conditions`, `response codes`, and checked-in generated artifacts as public contracts. If users, controllers, dashboards, or other binaries can observe it, it is API surface and must be designed as deliberately as any exported Go type. + +**Modes:** + +- **Write mode** — designing or changing CRDs, response/status enums, or reconciliation/bootstrap flows. Preserve contract clarity, idempotence, and compatibility. +- **Review mode** — reviewing API/controller changes. Flag raw string status values, duplicated semantics, non-idempotent bootstrap logic, and hand-edited generated artifacts. +- **Audit mode** — auditing controller/API contract quality. Split work across: (1) typed contract surfaces, (2) reconcile/bootstrap safety, (3) generated artifact hygiene. 
+ +> **Repo-specific skill.** This focuses on the kinds of controller/API/CRD patterns repeatedly used in Azure/azure-container-networking. + +# Go Control-Plane Contracts + +In ACN, CRD types, response codes, and observed status are not internal implementation details. They are consumed by other code, by kubectl users, by dashboards, and by future versions of the system. Design them as contracts. + +## Use when + +- editing files under `crd/` +- adding or changing `Status`, `Condition`, `Reason`, or `ResponseCode` values +- wiring bootstrap / migration / init flows in reconcilers or services +- changing what gets persisted, surfaced, or generated as a checked-in artifact +- updating generated CRDs or rendered Dockerfiles + +## Do not use when + +- the change is purely local business logic with no externally observed state +- the question is only about low-level Go mechanics already covered by other skills + +## Core Principles + +1. **Status is API contract** — if users or controllers observe it, it must be typed, stable, and intentionally named. +2. **Typed public codes beat raw literals** — `Status`, `Condition`, `Reason`, and `ResponseCode` surfaces use typed constants, not ad-hoc strings/ints. +3. **Spec is desired state; status is observed state** — counts, allocations, versions, and reconciliation results belong in `status`, not `spec`. +4. **Bootstrap and migration must be safe to replay** — reconcile from prior state, use fallback/version gating when needed, and make repeated execution converge. +5. **Reconciliation must be idempotent and stable** — avoid churn, duplicate side effects, and control-loop thrash. +6. **Generated artifacts are checked-in contracts** — edit the source, regenerate the artifact, and keep rendered/generated outputs in sync. + +## Status Is API Contract + +CRD `status` is not just debug info. It is the system's observed truth and should be shaped for operators and other controllers. 
+ +```go +// ✅ GOOD — desired state in spec, observed state in status +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Requested IPs",type=integer,JSONPath=`.spec.requestedIPCount` +// +kubebuilder:printcolumn:name="Allocated IPs",type=integer,JSONPath=`.status.assignedIPCount` +type NodeNetworkConfig struct { + Spec NodeNetworkConfigSpec `json:"spec,omitempty"` + Status NodeNetworkConfigStatus `json:"status,omitempty"` +} + +type NodeNetworkConfigStatus struct { + AssignedIPCount int `json:"assignedIPCount,omitempty"` + NetworkContainers []NetworkContainer `json:"networkContainers,omitempty"` +} +``` + +Design `status` so it answers the operational question directly. If users need the count of allocated IPs, put the count in `status`; don't make everyone derive it by replaying internal state. + +### Do not duplicate existing platform semantics + +If Kubernetes or the control plane already provides the semantic, don't invent a second shadow status for it. + +```go +// ❌ BAD — inventing a second "Deleting" contract +type MTPNCStatus struct { + Status string `json:"status,omitempty"` // now who sets "Deleting"? +} + +// ✅ GOOD — use the platform's deletion signal +if !obj.DeletionTimestamp.IsZero() { + // object is deleting +} +``` + +Using `DeletionTimestamp.IsZero()` is the canonical contract. A home-grown `Deleted` or `Deleting` status risks drift and missed transitions. + +## Typed Public Codes and States + +Any value that crosses an API/controller boundary should be typed. 
+ +```go +// ✅ GOOD — public response codes are typed +type ResponseCode int + +const ( + Success ResponseCode = 0 + InvalidRequest ResponseCode = 23 + NotFound ResponseCode = 14 + UnexpectedError ResponseCode = 99 +) +``` + +The same rule applies to `Status`, `Condition`, `Reason`, and other public contract values: + +```go +type PodNetworkStatus string + +const ( + PodNetworkStatusPending PodNetworkStatus = "Pending" + PodNetworkStatusReady PodNetworkStatus = "Ready" +) + +func IsTerminalStatus(s PodNetworkStatus) bool { + return s == PodNetworkStatusReady +} +``` + +### Map internal errors to typed public surfaces + +Internal Go errors are for code. Public response codes and status values are for contracts. + +```go +// ✅ GOOD — internal error mapped to public code +respCode := types.Success +if err := req.Validate(); err != nil { + logger.Error("invalid request", zap.Error(err)) + respCode = types.InvalidRequest +} + +response := cns.PostNetworkContainersResponse{ + Response: cns.Response{ + ReturnCode: respCode, + }, +} +``` + +Do not expose behavior by matching on `err.Error()` strings at the boundary. Convert once into a typed public code/status. + +## Bootstrap and Migration Safety + +Control-plane startup often needs to restore or reconcile from prior state. That logic must be safe, restartable, and version-aware. 
+ +```go +// ✅ GOOD — bootstrap from prior provider, then converge normally +func reconcileInitialCNSState( + nnc *v1alpha.NodeNetworkConfig, + ipamReconciler ipamStateReconciler, + podInfoByIPProvider cns.PodInfoByIPProvider, + isSwiftV2 bool, +) error { + if len(nnc.Status.NetworkContainers) == 0 { + return errors.New("no network containers found in NNC status") + } + // restore prior PodInfo state + // transform to current request format + // hand off to normal reconciliation path + return nil +} +``` + +Patterns to prefer: + +- reconcile from existing provider/state instead of rebuilding blindly +- validate capability/version before switching modes +- keep migration logic explicit and isolated +- let existing controller retry/backoff machinery do retries when possible + +Patterns to avoid: + +- startup code with hidden side effects or ad-hoc retry trees +- parallel bootstrap paths that can disagree on contract interpretation +- silently tolerating impossible prior state + +## Stable, Idempotent Reconciliation + +A reconcile loop should converge to the desired observed state. Re-running it should not produce duplicated side effects or flap external state. + +```go +// ✅ GOOD — state-derived, replay-safe reconciliation +for _, nc := range nnc.Status.NetworkContainers { + if err := sink(nc); err != nil { + return reconcile.Result{}, errors.Wrap(err, "failed to push NC update") + } +} +``` + +Keep reconciliation: + +- **idempotent** — safe to repeat +- **state-driven** — derived from current observed/desired data, not hidden history +- **stable** — use in-flight tracking or thresholds when the source of truth lags and the loop would otherwise thrash + +If a loop can flap because upstream state trails reality, prefer sticky state or hysteresis rather than churn. See `acn-go-design-boundaries` for the detailed hysteresis rule. + +## Generated Artifact Hygiene + +Generated artifacts in this repo are checked-in contract surfaces. Do not hand-edit the rendered output. 
+ +### CRDs + +CRD manifests are generated from Go types and checked in. Change the API type, then regenerate: + +```bash +make regenerate-crd +``` + +If you update `crd/**/api/...` without regenerating the manifest, the checked-in contract is stale. + +### Rendered Dockerfiles + +Rendered Dockerfiles explicitly point back to their source template: + +```Dockerfile +# SOURCE: cns/Dockerfile.tmpl +``` + +Edit the template, not the rendered file, then regenerate rendered outputs through the repo tooling. The checked-in rendered file is contract output, not authoring source. + +### Shared build tools are repo-owned + +Repo tooling is versioned and reproducible (`build/tools/go.mod`, `go tool -modfile=...`, renderkit, golangci-lint). Do not assume local global tools are the source of truth. + +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| Raw string status values in public structs | Define typed `Status`/`Condition`/`Reason` constants | +| Returning behavior via `err.Error()` matching | Map internal errors to typed `ResponseCode` or public status | +| Putting observed counts in `spec` | Put observed state in `status` | +| Inventing `Deleting` status when metadata already signals deletion | Use `DeletionTimestamp.IsZero()` | +| Startup bootstrap with hidden retries and side effects | Make bootstrap explicit, replay-safe, and idempotent | +| Hand-editing generated CRD manifest | Edit API type, regenerate CRD | +| Editing rendered Dockerfile directly | Edit `Dockerfile.tmpl`, regenerate rendered file | +| Reconcile loop that keeps churning external state | Use state-derived idempotence and stable convergence | + +## Cross-References + +- → See `acn-go-types-parsing` for typed enums, predicate helpers, and strong domain types +- → See `acn-go-design-boundaries` for behavior-vs-scenario config, fail-fast invariants, and hysteresis +- → See `acn-go-interfaces-dependencies` for narrow dependency surfaces and migration shims +- → See `acn-go-context-lifecycle` for 
bootstrap ownership and controller/process lifecycle diff --git a/.github/skills/acn-go-design-boundaries/SKILL.md b/.github/skills/acn-go-design-boundaries/SKILL.md new file mode 100644 index 0000000000..61cfa62132 --- /dev/null +++ b/.github/skills/acn-go-design-boundaries/SKILL.md @@ -0,0 +1,455 @@ +--- +name: acn-go-design-boundaries +description: "Go code design boundaries — behavioral configuration vs scenario coupling, side effects in functions, function parameter minimality, abstraction timing, control-loop hysteresis, validator purity, and preflight-before-mutation. Use when designing config-driven behavior, reviewing functions that accept whole structs when they need one field, refactoring code that branches on scenario names deep in the stack, stabilizing reconcile loops that request/release on every pass, separating validation from mutation, or evaluating whether a helper function or constructor adds value. Trigger on scenario names (Swift, MTPNC, overlay) appearing in low-level functions, validators that patch inputs while checking them, batch mutations that can partially apply before failing, functions mutating passed references instead of returning values, flapping threshold decisions in reconcile loops, or unnecessary abstractions." +user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang. +metadata: + author: rbtr + version: "1.0.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go minimalist architect. You believe that every function should accept the least information it needs, return values instead of mutating inputs, keep validators pure, and never know which product scenario invoked it. When you see a scenario name deep in the call stack, a validator that secretly mutates, or a batch mutation that fails halfway through, you know the boundary is wrong. 
+ +**Modes:** + +- **Write mode** — designing new functionality. Push scenario knowledge to the caller, pass behavioral parameters, keep validators/read-model helpers pure, preflight batches before mutation, and keep reconcile policies stable. +- **Review mode** — reviewing code for boundary violations. Flag scenario names in low-level code, validators that mutate passed references, batch mutations that can partially apply, over-broad parameters, flapping reconcile decisions, and premature abstraction. + +> **Complements** `samber/cc-skills-golang@golang-design-patterns` (which covers constructor and options patterns) with stronger opinions on parameter minimality and scenario decoupling. + +# Go Design Boundaries + +## Configure Behavior, Not Scenarios + +Low-level functions must not know which product scenario called them. The scenario is the caller's concern. The callee only needs to know **what behavior is desired**. + +```go +// ❌ BAD — scenario name leaks deep into the stack +func createNCRequestFromStaticNCHelper(config *CNSConfig, nc NetworkContainer) Request { + if config.ChannelMode == "SwiftV2" { + // process primary prefix IP + } +} +// Problems: +// 1. Not reusable if a new scenario wants the same behavior +// 2. Requires threading a whole scenario name through the stack +// 3. Low-level code depends on high-level config types + +// ✅ GOOD — behavioral parameter +func createNCRequestFromStaticNCHelper(processPrimaryIP bool, nc NetworkContainer) Request { + if processPrimaryIP { + // process primary prefix IP + } +} +// The caller already knows the scenario and passes the bool +// Any future scenario that needs this behavior works without code changes here +``` + +This seems minor but is actually a **huge architectural tenet**. As implemented with scenario names, the behavior is not reusable — you have to thread a new scenario name all the way down for every new feature. With a behavioral parameter, the caller just passes the bool. 
+ +### Applied: Config-Driven Behavior + +```go +// ❌ BAD — function needs the whole CNS config to check one thing +func setupHealthCheck(config *configuration.CNSConfig) error { + if config.ChannelMode == "CRD" { + // enable NNC health check + } +} + +// ✅ GOOD — pass the behavioral decision +func setupHealthCheck(enableNNCCheck bool) error { + if enableNNCCheck { + // enable NNC health check + } +} +// The caller (main/init) makes the scenario→behavior mapping +``` + +Initialize behavior in `main` based on scenario config. From that point on, nothing downstream knows the scenario name. + +## Side Effects Are Bad + +Passing a reference and mutating it is a bad pattern. It's impossible to tell from the calling code that the input is modified. Even if you suspect it, it's impossible to tell _how_ it changes. + +```go +// ❌ BAD — mutates input, side effects, nil-pointer risk +func addACLPolicies(podIPInfo *cns.PodIpInfo, policies []policy.Policy) { + // impossible to tell from the call site that podIPInfo is modified + // also: nil pointer exception if podIPInfo is nil + podIPInfo.EndpointPolicies = append(podIPInfo.EndpointPolicies, policies...) +} + +// ✅ GOOD — pure function, returns result, no side effects +func buildACLPolicies(existingPolicies []policy.Policy, newPolicies []policy.Policy) []policy.Policy { + return append(existingPolicies, newPolicies...) +} +// Caller explicitly assigns: info.EndpointPolicies = buildACLPolicies(info.EndpointPolicies, new) +``` + +When a function doesn't even need the full struct — it only appends to one field — it should be a stateless pure function that returns the result. This is more flexible, more testable, and has no nil-pointer risk. + +**Also worth noting:** Go is not C. Passing refs is not always more performant than passing values, because the GC must track pointer references. + +## Validators Validate; Mutators Mutate + +Validators belong on the pure side of the boundary. Their job is to answer "is this valid?" 
or "what validation result should the caller act on?" by returning `bool`, `error`, or a small result struct. They must not patch the request, append defaults, reserve addresses, or otherwise mutate inputs as a hidden side effect. + +```go +// ❌ BAD — "validator" mutates the request while checking it +func validateReleaseRequest(req *IPConfigsRequest) error { + if req == nil { + return fmt.Errorf("request must not be nil") + } + if req.NetworkContainerID == "" { + req.NetworkContainerID = deriveNCID(req.PodInfo) // hidden mutation + } + if len(req.IPsToRelease) == 0 { + req.IPsToRelease = append(req.IPsToRelease, req.PodInfo.PodIP) // more hidden mutation + } + return nil +} + +// ✅ GOOD — validator only reports validity +func validateReleaseRequest(req IPConfigsRequest) error { + if req.NetworkContainerID == "" { + return fmt.Errorf("networkContainerID is required") + } + if len(req.IPsToRelease) == 0 { + return fmt.Errorf("at least one IP must be released") + } + return nil +} +``` + +If you need defaulting or construction, make that a separate, explicitly named step. This is just the validator form of the same pure-function rule: callers should be able to see when mutation happens. + +## Preflight the Whole Batch Before Mutating + +When applying a batch mutation, do not validate and mutate one item at a time if the operation can fail halfway through. Preflight/validate the full batch first when possible, then perform the mutation. This is the batch form of the same boundary rule: decide first, mutate second. 
+ +```go +// ❌ BAD — validates and mutates one IP at a time; partial release is possible +func (m *IPAMManager) releaseSecondaryIPs(ctx context.Context, req IPConfigsRequest) error { + for _, ip := range req.IPsToRelease { + if err := validateReleaseIP(req.NetworkContainerID, ip, m.assignedIPsByNC); err != nil { + return err + } + if err := m.client.ReleaseIPs(ctx, IPConfigsRequest{ + NetworkContainerID: req.NetworkContainerID, + IPsToRelease: []string{ip}, + }); err != nil { + return err + } + } + return nil +} +// If the third IP is invalid, the first two may already be gone. + +// ✅ GOOD — preflight the whole batch, then perform one mutation step +type releasePlan struct { + ncID string + ips []string +} + +func buildReleasePlan(ncID string, requested []string, assigned map[string]struct{}) (releasePlan, error) { + seen := map[string]struct{}{} + for _, ip := range requested { + if _, ok := assigned[ip]; !ok { + return releasePlan{}, fmt.Errorf("ip %s is not assigned to %s", ip, ncID) + } + if _, dup := seen[ip]; dup { + return releasePlan{}, fmt.Errorf("duplicate IP %s in release request", ip) + } + seen[ip] = struct{}{} + } + return releasePlan{ncID: ncID, ips: requested}, nil +} + +func (m *IPAMManager) releaseSecondaryIPs(ctx context.Context, req IPConfigsRequest) error { + plan, err := buildReleasePlan(req.NetworkContainerID, req.IPsToRelease, m.assignedIPsByNC[req.NetworkContainerID]) + if err != nil { + return err + } + return m.client.ReleaseIPs(ctx, IPConfigsRequest{ + NetworkContainerID: plan.ncID, + IPsToRelease: plan.ips, + }) +} +``` + +If the downstream API forces one-by-one mutation, still preflight the entire batch first and stop before the first mutation when any item fails validation. + +## Minimal Constructors + +```go +// ❌ BAD — constructor for struct with no initialization logic +func NewIPTablesClient() *Client { + return &Client{} +} +// Adds a function that does nothing. 
Just use &iptables.Client{} + +// ✅ GOOD — constructor enforces required dependencies +func NewService(logger *zap.Logger, store UserStore) *Service { + if logger == nil { + panic("logger must not be nil") // fail fast, not at first log call + } + return &Service{logger: logger, store: store} +} +// If I let you construct directly and you forget the logger, everything NPEs +``` + +Use constructors when they enforce invariants (Poka-yoke). Don't add constructors whose only job is `return &Type{}`. + +## Inline Over Unnecessary Helpers + +```go +// ❌ BAD — helper takes more space than inline and obscures what happens +func formatError(operation string, err error) error { + return fmt.Errorf("failed to %s: %w", operation, err) +} +// ... +return formatError("create endpoint", err) + +// ✅ GOOD — inline is shorter and self-documenting +return fmt.Errorf("creating endpoint: %w", err) +``` + +If an indirection makes it less obvious what's happening and the helper takes up more space than having the code inline — get rid of it. + +**Exception:** centralize formatters for identifiers/keys where consistency matters (see `acn-go-types-parsing` skill). + +## Nil as Configuration is Bad + +```go +// ❌ BAD — passing nil to configure behavior +func ReconcileIPAMState(nnc *v1alpha.NodeNetworkConfig) { + if nnc == nil { + // different behavior when nil + return + } + // normal behavior +} +// Problems: +// 1. Hard to discover — how do you know nil means "skip"? +// 2. In normal scenarios, nil NNC is a fatal error we should catch +// 3. Accidentally passing nil gives wrong behavior instead of an error + +// ✅ GOOD — validate inputs immediately, use explicit flags for behavior +func ReconcileIPAMState(nnc *v1alpha.NodeNetworkConfig, skipReconcile bool) error { + if skipReconcile { + return nil + } + if nnc == nil { + // this is now clearly an error, not a feature + return fmt.Errorf("nnc must not be nil") + } + // ... 
+ return nil +} +``` + +Passing a nil ref is not a sustainable strategy for configuring behavior. It's hard to discover, and it prevents catching actual nil bugs. + +## The Decorator/Middleware Pattern + +When behavior varies by scenario, compose it at initialization instead of branching deep in the stack: + +```go +// ❌ BAD — scenario branching deep in the stack +func (w *Watcher) ReleaseIPs(ctx context.Context, req IPConfigsRequest) error { + if w.isStatelessCNIWindows { // scenario knowledge leaked! + w.deleteHNSEndpoint(req) + } + return w.client.ReleaseIPs(ctx, req) +} + +// ✅ GOOD — compose behavior at initialization +// In main: +var releaseIPs ReleaseIPsFunc = cnsClient.ReleaseIPs +if isStatelessCNIWindows(config) { + releaseIPs = withHNSCleanup(cnsClient) // decorator wraps the base +} +watcher := fsnotify.NewWatcher(releaseIPs) +// Watcher has no knowledge of scenarios — zero changes to the fsnotify package +``` + +## Hysteresis Beats Thrash + +If a reconcile/control-loop decision can flap, the boundary is wrong when every pass immediately mutates external state. The loop should own the stability policy; the helper should only execute a clear action. + +```go +// ❌ BAD — single threshold, churns external state on every loop +func (m *Manager) reconcilePool(ctx context.Context, freeIPs int) error { + if freeIPs < m.targetFree { + return m.client.RequestIPs(ctx, m.batchSize) + } + if freeIPs > m.targetFree { + return m.client.ReleaseIPs(ctx, freeIPs-m.targetFree) + } + return nil +} +// One pod create/delete around targetFree can alternate RequestIPs/ReleaseIPs forever. 
+ +// ✅ GOOD — hysteresis + sticky/in-flight state +type poolControl struct { + requestInFlight bool + releaseInFlight bool +} + +func (m *Manager) reconcilePool(ctx context.Context, freeIPs int, ctl *poolControl) error { + switch { + case ctl.requestInFlight || ctl.releaseInFlight: + return nil // wait for the previous external change to settle + case freeIPs <= m.lowWatermark: + ctl.requestInFlight = true + return m.client.RequestIPs(ctx, m.batchSize) + case freeIPs >= m.highWatermark: + ctl.releaseInFlight = true + return m.client.ReleaseIPs(ctx, freeIPs-m.targetFree) + default: + return nil // stay sticky inside the band + } +} +``` + +Use enter/exit thresholds, sticky state, or in-flight tracking whenever a pool-scaling or release decision can oscillate. Don't bury "should I request or release?" inside a helper that gets called every reconcile. Keep the stability policy at the boundary that owns the loop, and keep the action function dumb. + +## errgroup for Goroutine Lifecycle + +When multiple goroutines must run concurrently and any failure should tear down the group: + +```go +// ❌ BAD — manual channel-based error collection +errs := make(chan error) +go func() { errs <- watchPendingDelete(ctx) }() +go func() { errs <- watchFS(ctx) }() +err := <-errs // only gets first error, leaks the other goroutine + +// ✅ GOOD — errgroup.WithContext +g, groupCtx := errgroup.WithContext(ctx) +g.Go(func() error { return w.watchPendingDelete(groupCtx) }) +g.Go(func() error { return w.watchFS(groupCtx) }) +return g.Wait() // cancels groupCtx on first error, waits for all +``` + +The first error cancels `groupCtx`, signaling all goroutines to stop. `Wait()` blocks until all have exited. No leaked goroutines, no manual channel management. 
+ +## Constructors Must Return Errors + +If construction can fail, return the error — don't silently log and return a half-initialized struct: + +```go +// ❌ BAD — silently swallows error, returns broken watcher +func New(path string, logger *zap.Logger) *watcher { + if err := os.Mkdir(path, 0o755); err != nil { + logger.Error("error making directory", zap.Error(err)) + // continues with broken state! + } + return &watcher{path: path} +} + +// ✅ GOOD — caller can handle the failure +func New(path string, logger *zap.Logger) (*watcher, error) { + if err := os.Mkdir(path, 0o755); err != nil && !errors.Is(err, fs.ErrExist) { + return nil, errors.Wrapf(err, "failed to create dir %s", path) + } + return &watcher{path: path}, nil +} +``` + +## Async Metrics Must Not Block Hot Paths + +Telemetry collection must never add latency to request handling: + +```go +// ❌ BAD — synchronous metrics in request handler +func (s *Service) RequestIPConfig(ctx context.Context, req Request) Response { + resp := s.allocateIP(req) + s.publishMetrics(ctx) // blocks on metrics publish! + return resp +} + +// ✅ GOOD — defer async recording +func (s *Service) RequestIPConfig(ctx context.Context, req Request) Response { + defer s.recorder.Record() // triggers async channel-based worker + return s.allocateIP(req) +} + +// Recorder uses sync.Once + channel worker pattern +type Recorder struct { + once sync.Once + ch chan struct{} +} + +func (r *Recorder) Record() { + r.once.Do(func() { go r.worker() }) + select { + case r.ch <- struct{}{}: + default: // drop if worker is busy — metrics are best-effort + } +} +``` + +## Fail Fast on Invariant Violations + +When internal state is inconsistent — like duplicate IPs in the pool — treat it as unrecoverable. Don't silently last-write-wins. + +```go +// ❌ BAD — silently overwrites duplicate, hides corruption +for _, ip := range ips { + state[ip.Address] = ip // duplicate? 
too bad, last write wins +} + +// ✅ GOOD — detect and fail immediately +var ErrDuplicateIP = errors.New("duplicate IP in state") + +for _, ip := range ips { + if _, exists := state[ip.Address]; exists { + return fmt.Errorf("%w: %s", ErrDuplicateIP, ip.Address) + } + state[ip.Address] = ip +} +``` + +If this state corruption is truly unrecoverable, crash CNS so it restarts cleanly rather than operating with corrupted state. A crash is observable (CrashLoopBackoff, alerts); silent corruption is invisible. + +## Decision Checklist + +Before adding a parameter, helper, or branch: + +| Question | If yes... | +| --- | --- | +| Does this function need to know the scenario name? | Pass a behavioral bool/enum instead | +| Does this function mutate a passed reference? | Return a value instead | +| Is this validator mutating or defaulting while it checks? | Return `bool`/result/`error`; keep mutation in a separate step | +| Can this batch mutation fail halfway through? | Preflight the whole batch first, then mutate | +| Does this helper save code? | If not, inline it | +| Does this constructor do anything? | If only `&Type{}`, delete it | +| Is nil used to mean "skip"? | Use an explicit flag | +| Does this low-level package import high-level config? | Pass only the needed values | +| Can this reconcile decision flap around a threshold? 
| Add enter/exit thresholds or sticky/in-flight state | + +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| `if config.ChannelMode == "SwiftV2"` in helper | Pass `processPrimaryIP bool` | +| Mutating `*PodIpInfo` in-place via helper | Return `[]policy.Policy`, let caller assign | +| `validateReleaseRequest(req *IPConfigsRequest)` fills in NC IDs or IPs while checking | Return `error`/result only; do defaulting or building separately | +| `for _, ip := range ips { validateReleaseIP(ip); ReleaseIPs([]string{ip}) }` | Validate/build the full release plan first, then mutate | +| `NewClient()` that just does `return &Client{}` | Use `&Client{}` directly | +| Helper function larger than the inline code | Inline it | +| `if nnc == nil { /* different behavior */ }` | Validate nil as error, use explicit flags | +| `cnsConfig.ManageEndpointState && runtime.GOOS == "windows"` deep in stack | Resolve in main, pass result | +| `if freeIPs < targetFree { RequestIPs } else if freeIPs > targetFree { ReleaseIPs }` every reconcile | Use low/high watermarks and in-flight tracking | +| Creating a whole wrapper package for one function | Inline or use anonymous func | + +## Cross-References + +- → See `acn-go-interfaces-dependencies` skill for passing minimum info and dependency direction +- → See `acn-go-platform-abstraction` skill for OS-specific behavior boundaries +- → See `samber/cc-skills-golang@golang-design-patterns` for functional options, builders, and constructor patterns +- → See `samber/cc-skills-golang@golang-code-style` for function parameter and control flow rules diff --git a/.github/skills/acn-go-errors-logging/SKILL.md b/.github/skills/acn-go-errors-logging/SKILL.md new file mode 100644 index 0000000000..cb459b6c6f --- /dev/null +++ b/.github/skills/acn-go-errors-logging/SKILL.md @@ -0,0 +1,323 @@ +--- +name: acn-go-errors-logging +description: "Go error string formatting, zap logging discipline, and adjacent observability naming for production services. 
Use when writing error messages, choosing log levels, shaping zap messages and fields, deciding between log-and-return or single-handling, or reviewing error/log quality. Trigger on capitalized error strings, metadata stuffed into message prefixes, zap.Any used for known types, vague metric/config names, bare root logger usage, or logs that describe code instead of outcomes. Complements samber/cc-skills-golang@golang-error-handling with stronger opinions on conciseness, field discipline, and level discipline." +user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang. +metadata: + author: rbtr + version: "1.1.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go SRE who reads thousands of log lines daily. Every unnecessary word in an error string or log message is noise that hides signal. You optimize for the person debugging at 3am with `grep` and structured log queries. + +**Modes:** + +- **Write mode** — writing error handling and logging for new code. Apply single-handling rule, format errors as lowercase fragments, choose log levels deliberately, keep messages stable, and put runtime metadata in typed zap fields. +- **Review mode** — reviewing error/log quality in PR diffs. Flag capitalized errors, verbose logs, log-and-return violations, expanded acronyms, metadata hidden in message text, `zap.Any` for known types, vague metric/config names, and logs that describe code flow instead of outcomes. + +> **Complements** `samber/cc-skills-golang@golang-error-handling` (which covers error mechanics) and `samber/cc-skills-golang@golang-observability` (which covers observability infrastructure). This skill focuses on **error string quality, log conciseness, zap field discipline, and nearby metric/config naming that affects production debugging** specifically. 
+ +# Go Error & Logging Discipline + +## Error String Rules + +Error strings are fragments, not sentences. They get wrapped and embedded in other errors and logs. Sentence-casing reads wrong when embedded. + +```go +// ❌ BAD — sentence-cased, reads wrong when wrapped +return errors.New("Connection string cannot be empty") +// produces: "failed to build appinsights client: Connection string cannot be empty" + +// ✅ GOOD — lowercase fragment, composes naturally +return errors.New("connection string cannot be empty") +// produces: "failed to build appinsights client: connection string cannot be empty" +``` + +### Error String Principles + +1. **Lowercase, no trailing punctuation** — errors are fragments that get composed with `fmt.Errorf("context: %w", err)` +2. **Concise** — describe what failed, not what to do about it +3. **Include the package name in sentinel errors** — `errors.New("apiclient: not found")` identifies origin +4. **Embed structured data in the error, not the string** — embed fields in custom error types or attach them as zap fields at the log site + +```go +// ❌ BAD — verbose, prescriptive +return fmt.Errorf("Failed to retrieve network container version list from NMAgent") + +// ✅ GOOD — concise, descriptive +return fmt.Errorf("failed to get nc version list from nmagent") +``` + +## Logging Philosophy + +**Logs need to be concise and information dense. More to read is actually _worse_ for readability.** + +### Don't Expand Well-Known Acronyms + +| ❌ Verbose | ✅ Concise | Why | +| --- | --- | --- | +| "IP Address" | "IP" | "Address" is redundant — it's in the "A" | +| "Network Container" | "NC" | Standard domain term, everyone knows it | +| "Failed to synchronize host network container versions with NMAgent" | "sync host nc versions error" | Shorter, same information | + +Writing out well-understood acronyms is not better. For logs, longer is not better. 
+ +### Log What Happened, Not What the Code Does + +```go +// ❌ BAD — describes the code, not the outcome +logger.Debug("checking if primary IP should be processed") + +// ✅ GOOD — tells us what happened and what decision was made +logger.Debug("primary IP processing decision", + zap.Bool("processPrimary", processPrimary), + zap.Bool("overlayMode", overlayMode), +) +``` + +A log that merely restates the code is useless — the reader can read the code. Logs should tell you _what happened_ at runtime: what values, what decisions, what outcomes. + +Move logs inside or after conditionals to give information about the branch taken. + +### Metadata Belongs in Fields, Not Message Prefixes + +Keep the message about the event. Put variable metadata in fields so the message stays short, stable, and queryable. + +```go +// ❌ BAD — metadata buried in message text +logger.Info("pod=" + podName + " ncID=" + ncID + " hostNC=true nc refresh succeeded") + +// ✅ GOOD — stable message, structured metadata +logger.Info("nc refresh succeeded", + zap.String("pod", podName), + zap.String("ncID", ncID), + zap.Bool("hostNC", true), +) +``` + +Do not build pseudo-key/value prefixes into messages. They make aggregation, deduping, and filtering harder than a stable message plus fields. + +### Prefer Typed zap Fields Over `zap.Any` + +If the type is known, encode it with the matching zap helper. `zap.Any` is for genuinely dynamic payloads or short-lived debugging, not routine production logs. 
+ +```go +// ❌ BAD — known types hidden behind zap.Any +logger.Debug("nc reconcile config", + zap.Any("retryCount", retryCount), + zap.Any("timeoutSeconds", cfg.TimeoutSeconds), + zap.Any("reconcileInterval", cfg.ReconcileInterval), +) + +// ✅ GOOD — preserve type information explicitly +logger.Debug("nc reconcile config", + zap.Int("retryCount", retryCount), + zap.Int("timeoutSeconds", cfg.TimeoutSeconds), + zap.Duration("reconcileInterval", cfg.ReconcileInterval), +) +``` + +Prefer `zap.String`, `zap.Int`, `zap.Bool`, `zap.Duration`, `zap.Error`, `zap.Stringer`, etc. when the type is known. + +### Derive Loggers, Don't Pass Root + +```go +// ❌ BAD — bare root logger, no context +func NewService(logger *zap.Logger) *Service { + return &Service{logger: logger} +} +// all logs say "msg=something" with no indication of source + +// ✅ GOOD — derived logger with identifying fields +func NewService(logger *zap.Logger) *Service { + return &Service{logger: logger.With(zap.String("component", "service"))} +} +``` + +### Don't Log Internal State on Hot Paths + +```go +// ❌ BAD — dumps entire state map on every request +log.Printf("current state: %+v", internalState) + +// ✅ GOOD — log at debug level, log only the relevant key +logger.Debug("ip config lookup", zap.String("ip", requestedIP)) +``` + +## Single-Handling Rule + +Errors MUST be either logged OR returned — **never both**. Duplicated log lines in aggregation services are noise that obscure the real signal. + +```go +// ❌ BAD — log AND return (duplicates in aggregation) +if err != nil { + logger.Error("failed to get NCs", zap.Error(err)) + return err +} + +// ✅ GOOD — return with context, let the caller decide +if err != nil { + return fmt.Errorf("getting NCs: %w", err) +} +``` + +The caller that ultimately handles the error can log it with full context from the error chain. + +## Log Level Discipline + +| Level | Use for | Stack traces? 
| +| --- | --- | --- | +| **Debug** | Internal state, decision points, useful context for troubleshooting | No | +| **Info** | Significant lifecycle events (started, stopped, config loaded) | No | +| **Warn** | Degraded but recoverable situations | No | +| **Error** | Failures requiring attention | Yes (Error+) | + +Stack traces at Warn are noise. Reserve them for Error and above where you actually need the call site to diagnose. + +```go +// ❌ BAD — stack at Warn level, not useful +logger.Warn("retrying connection", zap.Stack("stack")) + +// ✅ GOOD — stack only at Error +logger.Error("connection permanently failed", zap.Error(err)) // zap adds stack at Error+ +``` + +### Log Format Considerations + +- **stdout (human-readable):** logfmt is more readable than JSON for `kubectl logs` +- **Machine-consumed outputs:** JSON for log aggregation services (Datadog, Loki, etc.) +- **AI/telemetry sinks:** JSON for structured parsing + +## Observability Naming That Supports Debugging + +These naming rules belong here because operators correlate logs, errors, metrics, and config when debugging the same incident. This is not a general observability design guide. + +### Metric Names, Labels, and Help Must Be Explicit + +Use explicit, `snake_case` metric names and label keys, plus help text that says exactly what is being counted or measured. + +```go +// ❌ BAD — vague metric, vague label, vague help +refreshErrors := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "Errors", + Help: "Error count", +}, []string{"type"}) + +// ✅ GOOD — explicit, searchable, and consistent with the log/event name +refreshFailures := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "nmagent_nc_refresh_failures_total", + Help: "Total number of failed NMAgent network container refresh operations.", +}, []string{"operation", "failure_reason"}) +``` + +If the error log says `"nc refresh failed"`, the related metric should not be named `Errors`. Make the signal self-explanatory. 
+ +### Config Names Must Carry Units + +When config is numeric, the name must tell the operator the unit. If the type is `time.Duration`, the name still needs a clear duration/interval meaning. + +```go +// ❌ BAD — unit is unclear +type RawNMAgentConfig struct { + RequestTimeout int `json:"requestTimeout"` + SyncPeriod int `json:"syncPeriod"` +} + +// ✅ GOOD — unit is obvious from the config name or type +type NMAgentConfig struct { + RequestTimeoutSeconds int `json:"requestTimeoutSeconds"` + ReconcileInterval time.Duration `json:"reconcileInterval"` +} +``` + +`timeout=30` is ambiguous. `timeoutSeconds=30` and `reconcileInterval=5s` are not. + +## Useful Debug Logs, Not Deleted Logs + +```go +// ❌ BAD — deleting logs entirely when they're noisy +// (just deleted the log lines) + +// ✅ GOOD — change the level instead +logger.Debug("host nc version response", zap.Int("count", len(versions))) +``` + +Most noisy logs are useful _debug_ logs. Change the level instead of deleting them. + +## Retry with Libraries, Not Hand-Rolled Loops + +```go +// ❌ BAD — hand-rolled retry with sleep +go func() { + for { + if err := w.Start(ctx); err != nil { + logger.Error("failed, will retry", zap.Error(err)) + time.Sleep(time.Minute) + continue + } + return + } +}() + +// ✅ GOOD — use retry library with backoff and context +go func() { + _ = retry.Do(func() error { + w, err := fsnotify.New(path, logger) + if err != nil { + return errors.Wrap(err, "failed to create watcher") + } + return w.Start(ctx) + }, retry.DelayType(retry.BackOffDelay), retry.Attempts(0), retry.Context(ctx)) +}() +``` + +Use `avast/retry-go` (already in the codebase) instead of hand-rolled retry loops. It provides exponential backoff, context cancellation, and attempt limits for free. 
+ +### Closure-Returning Factory Functions for Retry + +When retry logic needs setup, return a closure that `retry.Do` can call: + +```go +func tryStopServiceFn(ctx context.Context, service managedService) func() error { + return func() error { + status, err := service.Query() + if err != nil { + return errors.Wrap(err, "could not query service") + } + if status.State == svc.Stopped { + return nil + } + _, err = service.Control(svc.Stop) + return errors.Wrap(err, "could not stop service") + } +} + +// Usage: composable with retry +_ = retry.Do(tryStopServiceFn(ctx, service), retry.UntilSucceeded(), retry.Context(ctx)) +``` + +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| `errors.New("Connection string cannot be empty")` | Lowercase: `"connection string cannot be empty"` | +| `"Failed to retrieve network container version list from NMAgent"` | Concise: `"failed to get nc version list from nmagent"` | +| Logging AND returning the same error | Pick one. Return with context, or log and handle | +| `logger.Warn("...", zap.Stack("stack"))` | Reserve stacks for Error+ | +| Passing bare root `*zap.Logger` | Derive with `.With(zap.String("component", ...))` | +| `logger.Info("pod=... ncID=... refresh succeeded")` | Keep the message stable and move metadata into zap fields | +| `zap.Any("reconcileInterval", cfg.ReconcileInterval)` | Use `zap.Duration(...)` or another typed field when the type is known | +| Metric `Name: "Errors", Help: "Error count", []string{"type"}` | Use explicit `snake_case` names/labels and specific help text, e.g. 
`nmagent_nc_refresh_failures_total`, `operation`, `failure_reason` | +| Config `RequestTimeout int` | Encode units: `RequestTimeoutSeconds int` or `RequestTimeout time.Duration` with a clear external format | +| Deleting noisy logs | Change level to Debug instead | +| Log that describes code flow | Log what happened: values, decisions, outcomes | +| Expanding "NC" to "Network Container" in logs | Well-known acronyms are fine and preferred for density | +| Hand-rolled `for { ... time.Sleep ... }` retry | Use `retry.Do` with backoff and context | + +## Cross-References + +- → See `samber/cc-skills-golang@golang-error-handling` for error creation, wrapping, `errors.Is`/`errors.As`, and panic/recover +- → See `samber/cc-skills-golang@golang-observability` for structured logging setup and middleware +- → See `acn-go-types-parsing` skill for embedding structured data in error types diff --git a/.github/skills/acn-go-http-api-contracts/SKILL.md b/.github/skills/acn-go-http-api-contracts/SKILL.md new file mode 100644 index 0000000000..94b973c765 --- /dev/null +++ b/.github/skills/acn-go-http-api-contracts/SKILL.md @@ -0,0 +1,191 @@ +--- +name: acn-go-http-api-contracts +description: "HTTP and REST contract design for Azure/azure-container-networking. Use when designing or reviewing HTTP handlers, routes, request/response types, query parameters, and action semantics. Trigger on GET handlers with bodies, PUT vs PATCH questions, action-heavy URL designs, redundant wrapper responses, or APIs that split a single resource operation across mismatched request shapes." +user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang with HTTP APIs. +metadata: + author: rbtr + version: "1.0.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go API reviewer who treats HTTP shape as contract design, not routing trivia. 
A resource path, method, and payload must describe the same operation. If the URL says one thing and the body says another, the API is wrong. + +**Modes:** + +- **Write mode** — designing new HTTP APIs or reshaping handlers. Prefer resource-shaped URLs, typed request/response bodies, and method semantics that match the operation. +- **Review mode** — reviewing handlers, route wiring, and request/response contracts. Flag GET bodies, action-shaped URLs, redundant wrapper responses, and operations whose semantics belong in a different method or path. +- **Audit mode** — auditing an HTTP surface. Split work across: (1) URL/resource shape, (2) method/idempotency, (3) request/response contract clarity. + +> **Repo-specific skill.** This captures repeated HTTP/API guidance from rbtr's ACN review history. + +# Go HTTP/API Contracts + +HTTP method, URL, query parameters, and body must all describe the same resource operation. Do not force callers to reverse-engineer your intent from mismatched pieces. + +## Use when + +- adding or reviewing HTTP handlers or routes +- choosing between GET / POST / PUT / PATCH +- designing request/response structs +- deciding whether something belongs in path, query, or body +- simplifying REST/controller handler surfaces + +## Do not use when + +- the change is internal-only and not exposed through HTTP +- the issue is only transport/client plumbing, not API contract shape + +## Core Principles + +1. **GET has no body** — use query parameters for filtering and selection. +2. **Choose PUT vs PATCH by semantics** — PUT replaces a resource representation; PATCH mutates part of it; POST performs creation or action semantics. +3. **Use resource-shaped URLs** — the path should identify the resource being operated on, not smuggle domain actions into arbitrary endpoints. +4. **Prefer HTTP status over extra app error-code layers** when the HTTP contract already expresses the state. +5. 
**One typed request/response surface per operation** — avoid redundant wrapper responses and mismatched body/path semantics. + +## No GET Bodies + +```go +// ❌ BAD — GET with body +GET /ibdevices +{ + "state": "assigned" +} + +// ✅ GOOD — GET with query filters +GET /ibdevices?state=assigned +GET /ibdevices?state=unassigned +GET /ibdevices?by-pod=namespace%2Fname +``` + +GET retrieves a representation. If the caller is selecting or filtering, use query parameters. + +## PUT vs PATCH vs POST + +Choose the method based on what the operation means: + +| Method | Use for | Example | +| --- | --- | --- | +| **GET** | read a resource or collection | `GET /ibdevices?state=assigned` | +| **PUT** | replace a full resource representation, idempotently | `PUT /nodes/{id}/status` | +| **PATCH** | partial mutation of an existing resource | `PATCH /pods/{id}` | +| **POST** | create a resource or invoke an action endpoint | `POST /ibdevices:assign?...` | + +```go +// ❌ BAD — POST on a resource path that does not mean create/replace +POST /ibdevices/{id} +{ + "pod": "ns/name" +} + +// ✅ GOOD — action is explicit when it is not whole-resource replacement +POST /ibdevices:assign?ibdevs=mac1,mac2&pod=ns%2Fname +``` + +If POSTing to `/resource/{id}` does **not** create or replace the whole resource, the API is lying about its contract. + +## Resource-Shaped URLs + +The path should identify the thing being operated on. + +```go +// ❌ BAD — path and payload describe different resources +POST /ibdevices/{mac} +{ + "pod": "ns/name", + "devices": ["mac1", "mac2"] +} + +// ✅ GOOD — collection plus filter/action surfaces +GET /ibdevices/{mac} +GET /ibdevices?state=assigned +GET /ibdevices?by-pod=ns%2Fname +POST /ibdevices:assign?ibdevs=mac1,mac2&pod=ns%2Fname +``` + +If the body is really about a mapping between resources, model the mapping explicitly. Do not pretend the request is “about” one thing while the payload mutates another. 
+ +## Path vs Query vs Body + +Use each part of the request for what it is good at: + +- **Path** — identifies the resource +- **Query** — selects, filters, or scopes the operation +- **Body** — supplies representation or mutation payload when the method semantics require one + +```go +// ✅ GOOD — query carries the selection set +POST /ibdevices:assign?ibdevs=mac1,mac2&pod=ns%2Fname + +// ✅ GOOD — body carries the representation being replaced +PUT /nodenetworkconfigs/{name} +{ + "spec": { ... } +} +``` + +Do not require a body when the input is just selectors that fit naturally in query params. + +## Prefer HTTP Status Over Redundant App Error Layers + +When HTTP already expresses the error class, use it. + +```go +// ❌ BAD — HTTP 200 plus app-layer errorCode for a missing resource +{ + "errorCode": "DeviceNotFound", + "message": "device not found" +} + +// ✅ GOOD — HTTP status carries the contract +HTTP/1.1 404 Not Found +{ + "message": "device not found" +} +``` + +`409 Conflict`, `404 Not Found`, `400 Bad Request`, and friends already communicate most API contract failures. Avoid layering an extra parallel contract unless it is truly required across transports. + +## Simplify Handler Surfaces + +Handlers should read like: decode → validate → operate → encode. + +```go +// ✅ GOOD — typed request/response surface per operation +type AssignIBDevicesRequest struct { + Pod string `json:"pod"` + Devices []string `json:"devices"` +} + +func (s *Server) assignIBDevices(w http.ResponseWriter, r *http.Request) { + var req AssignIBDevicesRequest + if err := decode(r, &req); err != nil { + http.Error(w, "invalid request", http.StatusBadRequest) + return + } + // validate -> operate -> encode +} +``` + +Avoid broad pass-through wrappers or responses that exist only to restate HTTP status in another format. 
+ +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| GET request with a body | Use query parameters | +| POST on `/resource/{id}` for a non-create/non-replace action | Use action endpoint or the correct method | +| URL path identifies one resource while body mutates another | Model the actual resource/mapping explicitly | +| Query-worthy selectors placed in JSON body | Put selectors in query params | +| Returning HTTP 200 with app-layer not-found/conflict code | Use HTTP 404/409/etc. | +| Handler response wrapped only to repeat status | Return one typed response shape or plain HTTP error | +| Splitting one logical operation across mismatched path/body shapes | Design one coherent typed request/response surface | + +## Cross-References + +- → See `acn-go-control-plane-contracts` for typed public response/status contracts and CRD/API surface hygiene +- → See `acn-go-interfaces-dependencies` for simplifying handler surfaces and migration shims +- → See `acn-go-errors-logging` for boundary error handling and concise API-facing messages +- → See `acn-go-design-boundaries` for keeping business behavior out of transport-specific plumbing diff --git a/.github/skills/acn-go-interfaces-dependencies/SKILL.md b/.github/skills/acn-go-interfaces-dependencies/SKILL.md new file mode 100644 index 0000000000..395884bfc3 --- /dev/null +++ b/.github/skills/acn-go-interfaces-dependencies/SKILL.md @@ -0,0 +1,293 @@ +--- +name: acn-go-interfaces-dependencies +description: "Go interface design, dependency direction, and package coupling patterns. Use when defining interfaces, choosing where to place them, designing constructors, injecting dependencies, or reviewing import graphs. Trigger on exported interfaces in provider packages, interfaces matching a single type, backwards imports (server importing client), or functions accepting entire config structs when they need one field. 
Supersedes generic Go interface guidance with stronger opinions on consumer-side interfaces and coupling." +user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang. +metadata: + author: rbtr + version: "1.0.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go dependency architect. You believe interfaces belong to consumers, not providers. Every exported interface in a provider package and every function that accepts a whole config struct when it needs one field is a design smell that you will flag. + +**Modes:** + +- **Write mode** — designing new interfaces and package boundaries. Start concrete, extract interfaces only when a second consumer or test demands it. +- **Review mode** — reviewing PR diffs for coupling and interface violations. Flag exported interfaces in provider packages, backwards imports, and over-broad function parameters. +- **Audit mode** — auditing dependency graph health. Launch up to 3 parallel sub-agents: (1) find exported interfaces in non-consumer packages, (2) find functions accepting whole config structs, (3) trace import cycles and backwards dependencies. + +> **Supersedes** `samber/cc-skills-golang@golang-structs-interfaces` on interface placement and coupling. The samber skill covers interface mechanics; this skill covers interface _strategy_. + +# Go Interface Design & Dependency Direction + +> "If something can do _this_, it can be used _here_." — Effective Go + +Interfaces in Go are defined by the consumer, not the provider, to describe the behavior that an object must have to be passed in to them. + +## Core Principles + +1. **Interfaces belong to consumers** — define the interface where you _use_ it, not where you implement it. The consumer controls the contract. +2. 
**Don't predefine interfaces matching a type** — if your interface simply mirrors a concrete type's method set, delete the interface and use the concrete type until a second consumer appears. +3. **Accept interfaces, return structs** — always. Never return an interface from a constructor. +4. **Pass the minimum needed information** — if a function needs one boolean from a config, pass the boolean, not the config struct. Don't let low-level code depend on high-level config types. +5. **No backwards imports** — servers don't import clients. CNS doesn't import CNI. Higher-level code imports lower-level code, never the reverse. + +## Consumer-Defined Interfaces + +```go +// ❌ BAD — interface defined in the provider package +// package cns/service +type IPAMStateReconciler interface { + ReconcileIPAMState(ncs []v1alpha.NetworkContainer, nnc *v1alpha.NodeNetworkConfig) types.ResponseCode +} + +// ✅ GOOD — interface defined where consumed +// package nodesubnet (the consumer) +type ipamReconciler interface { + ReconcileIPAMState(ncs []v1alpha.NetworkContainer, nnc *v1alpha.NodeNetworkConfig) types.ResponseCode +} + +type Manager struct { + reconciler ipamReconciler // accepts anything with this method +} +``` + +It usually doesn't make sense to have a public interface in a separate package. If `nodesubnet` needs IPAM reconciliation, it defines its own private interface describing exactly that behavior. Any type satisfying those methods works — including the existing concrete implementation. + +Reference: [Preemptive Interface Anti-Pattern in Go](https://medium.com/@cep21/preemptive-interface-anti-pattern-in-go-54c18ac0668a) + +## Pass Minimum Information + +```go +// ❌ BAD — function depends on entire CNS config +func createNCRequest(cnsConfig *configuration.CNSConfig, nc NetworkContainer) Request { + if cnsConfig.ChannelMode == "swift_v2" { // leaks scenario knowledge + // ... 
+ } +} + +// ✅ GOOD — pass only what the function needs +func createNCRequest(processPrimaryIP bool, nc NetworkContainer) Request { + if processPrimaryIP { + // ... + } +} +``` + +The caller already knows the scenario. The callee only needs to know the behavior. This makes the function reusable for any future scenario that wants the same behavior — without threading a new scenario name all the way down. + +## Simplify Handler Surfaces + +Handlers should expose one typed request/response surface per operation. Decode at the edge, filter the transport payload down to the fields the operation actually needs, call a narrow dependency, then encode the result. Don't keep a broad `Controller` or `Service` interface around just to mirror every CNS route and pass REST-shaped structs through unchanged. + +```go +// ❌ BAD — pass-through controller that just mirrors the REST table +type ipamController interface { + RequestIPConfig(context.Context, cns.IPConfigRequest) (*cns.IPConfigResponse, error) + RequestIPConfigs(context.Context, cns.IPConfigsRequest) (*cns.IPConfigsResponse, error) + ReleaseIPConfig(context.Context, cns.IPConfigsRequest) (*cns.IPConfigsResponse, error) +} + +func (service *HTTPRestService) RequestIPConfigHandler(w http.ResponseWriter, r *http.Request) { + var req cns.IPConfigRequest + if err := common.Decode(w, r, &req); err != nil { + return + } + + resp, err := service.controller.RequestIPConfig(r.Context(), req) + writeIPConfigResponse(w, resp, err) +} +``` + +```go +// ✅ GOOD — handler is decode-filter-encode over one operation surface +type requestIPConfigs interface { + Run(context.Context, RequestIPConfigsInput) (RequestIPConfigsOutput, error) +} + +type RequestIPConfigsInput struct { + PodInterfaceID string + InfraContainerID string + Ifname string + DesiredIPAddresses []string +} + +type RequestIPConfigsOutput struct { + Response cns.Response + PodIPInfo []cns.PodIpInfo +} + +func (service *HTTPRestService) RequestIPConfigHandler(w 
http.ResponseWriter, r *http.Request) { + var req cns.IPConfigRequest + if err := common.Decode(w, r, &req); err != nil { + return + } + + input := RequestIPConfigsInput{ + PodInterfaceID: req.PodInterfaceID, + InfraContainerID: req.InfraContainerID, + Ifname: req.Ifname, + } + if req.DesiredIPAddress != "" { + input.DesiredIPAddresses = []string{req.DesiredIPAddress} + } + + out, err := service.requestIPConfigs.Run(r.Context(), input) + resp := &cns.IPConfigResponse{Response: out.Response} + if len(out.PodIPInfo) > 0 { // guard: Run may return no IPs (e.g. on error) + resp.PodIpInfo = out.PodIPInfo[0] + } + writeIPConfigResponse(w, resp, err) +} +``` + +`RequestIPConfigHandler` can act as a migration adapter from the older one-IP REST surface to a richer multi-IP operation. That's fine. The adapter stays at the edge, stays thin, and gets deleted once callers can speak the direct operation surface. Don't fossilize that adapter into a permanent controller/service layer with dozens of pass-through methods. + +## Import Direction + +``` +✅ GOOD dependency direction: +main → cns/service → cns/types +main → cni/network → cns/types (shared types) + +❌ BAD — backwards: +cns/service → cni/network (server imports client) +cns/restserver → crd/... (core imports extension) +``` + +If you find yourself needing types from a "sibling" package, extract shared types to a lower-level package that both can import. + +## The stdlib http.Client Pattern + +The standard library's `http.Client` is a concrete type. It does not implement a predefined interface. Consumers define their own: + +```go +// Your package defines only what it needs +type httpDoer interface { + Do(req *http.Request) (*http.Response, error) +} + +func registerNode(ctx context.Context, client httpDoer, endpoint string) error { + // works with *http.Client, or any mock +} +``` + +This is the model. Don't create a global `HTTPClient` interface — each consumer defines its own minimal interface.
+ +## Constructor Design + +```go +// ❌ BAD — constructor for struct with no initialization logic +func NewIPTablesClient() *Client { + return &Client{} +} +// Just use: c := &iptables.Client{} + +// ✅ GOOD — constructor when initialization is needed (Poka-yoke) +func NewService(logger *zap.Logger, store UserStore) *Service { + return &Service{logger: logger, store: store} +} +// Constructor prevents nil logger NPEs — can't forget required deps +``` + +If a struct's zero value is valid, a constructor is unnecessary boilerplate. Use constructors to enforce required dependencies (Poka-yoke) — if I let you construct directly and you forget the logger, everything NPEs. + +## Function Types Over Single-Method Interfaces + +When a dependency has only one method, a function type is lighter than an interface — no type to define, no struct to wrap, and it composes naturally with closures and method references. + +```go +// ❌ BAD — interface for a single method +type nodeNetworkConfigListener interface { + Update(*v1alpha.NodeNetworkConfig) error +} + +type Reconciler struct { + listener nodeNetworkConfigListener +} + +// ✅ GOOD — function type, lighter and more composable +type nodenetworkconfigSink func(*v1alpha.NodeNetworkConfig) error + +type Reconciler struct { + sink nodenetworkconfigSink +} + +// Construction: pass the method reference directly +r := NewReconciler(poolMonitor.Update) // .Update satisfies the func type + +// Or wrap with a closure for extra behavior: +initializer := func(nnc *v1alpha.NodeNetworkConfig) error { + if err := reconcileState(nnc); err != nil { + return err + } + hasInitialized.Set(1) + return nil +} +r := NewReconciler(initializer) +``` + +Function types also make testing trivial — no mock struct needed: + +```go +r := NewReconciler(func(nnc *v1alpha.NodeNetworkConfig) error { + return nil // test stub +}) +``` + +**When to use function types vs interfaces:** +- **1 method** → function type +- **2-3 methods that always travel together** → 
small interface +- **Methods with shared state** → interface backed by a struct + +## Adapter Pattern for Incremental Migration + +When replacing a subsystem (v1 → v2), use an adapter to bridge the new implementation back to existing consumers. This avoids flag-day migrations: + +```go +// v2 Monitor has a completely different design (channel-driven, no interface) +// Adapter bridges it back to the v1 interface so existing code doesn't change +type v2Adapter struct { + monitor *v2.Monitor +} + +func (a *v2Adapter) Update(nnc *v1alpha.NodeNetworkConfig) error { + a.monitor.Push(nnc) // adapts interface call to channel push + return nil +} +``` + +The adapter can be deleted later when consuming code migrates to the v2 design directly. + +## When NOT to Interface + +- **One implementation, no tests needing mocks** → use the concrete type +- **Single method** → use a function type instead +- **The interface just mirrors the type** → delete it, use the type +- **You're creating it "for the future"** → YAGNI, extract when needed +- **It's in the provider package** → move it to the consumer + +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| Exported interface in provider package | Move to consumer, make private | +| Interface matching single concrete type | Delete interface, use concrete type | +| Returning interface from constructor | Return concrete type | +| Passing `*CNSConfig` to low-level function | Pass the specific bool/string needed | +| Server importing client package | Extract shared types to common package | +| Global `HTTPClient` interface | Each consumer defines its own `httpDoer` | +| Constructor for zero-value struct | Use `&Type{}` directly | +| Creating interface before second implementation | Start concrete, extract when needed | +| Single-method interface | Use a function type instead | +| Flag-day migration to new subsystem | Use adapter to bridge v2 back to v1 consumers | +| Controller/service interface mirrors REST routes | Give each 
operation one typed request/response surface | +| Adapter became permanent architecture | Keep it as a thin migration shim, then delete it | + +## Cross-References + +- → See `acn-go-design-boundaries` skill for behavioral config vs scenario config (closely related to pass-minimum-info) +- → See `acn-go-platform-abstraction` skill for OS-specific interface patterns +- → See `samber/cc-skills-golang@golang-structs-interfaces` for interface mechanics (type assertions, embedding, receivers) +- → See `samber/cc-skills-golang@golang-design-patterns` for functional options and constructor patterns diff --git a/.github/skills/acn-go-platform-abstraction/SKILL.md b/.github/skills/acn-go-platform-abstraction/SKILL.md new file mode 100644 index 0000000000..97cb554ea1 --- /dev/null +++ b/.github/skills/acn-go-platform-abstraction/SKILL.md @@ -0,0 +1,163 @@ +--- +name: acn-go-platform-abstraction +description: "Go platform/OS-specific code patterns for cross-platform projects. Use when writing or reviewing code that must work on multiple OSes (Linux, Windows), when you see noop stubs in _.go files, runtime.GOOS checks, or OS-specific build tags. Supersedes generic Go guidance on build tags and OS-specific code. Trigger on any code that branches by operating system." +user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang with cross-platform (Linux/Windows) support. +metadata: + author: rbtr + version: "1.0.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go platform engineer who believes that if you need a noop stub, your abstraction is in the wrong place. OS-specific code must be invisible to the common code paths — the `if` moves out of the code and into the file system via `_.go` files. + +**Modes:** + +- **Write mode** — implementing cross-platform functionality. 
Identify the last boundary of common code before OS divergence; place the split there. +- **Review mode** — reviewing code for platform abstraction issues. Flag noop stubs, runtime.GOOS checks, and scenario-named OS code. Sequential. +- **Audit mode** — auditing a codebase for platform abstraction violations. Launch up to 3 parallel sub-agents: (1) find noop stubs in `_.go` files, (2) find `runtime.GOOS` checks in non-test code, (3) find OS-specific imports in common files. + +> **Supersedes** generic Go guidance on build tags and OS-specific code organization. + +# Go Platform Abstraction + +Noop implementations to make the compiler happy indicate that the OS abstraction is **wrong**. Once you get to the point that you need to execute OS-specific behavior, you are _already_ in OS-specific code, and _that's_ what needs to be separated into `_.go` files. + +## Core Principles + +1. **Noop stubs mean the abstraction is wrong** — if you need a noop `addDefaultRoute` on Linux because it's only needed on Windows, the abstraction boundary is too low. Move it up to where the calling code diverges. +2. **Never check `runtime.GOOS`** — any explicit GOOS check or noop stubbing of an interface in a `*_.go` file indicates the abstraction is in the wrong place. +3. **Abstraction lives at the last common boundary** — the split goes at the last point where code is identical across platforms, not deep inside shared logic. +4. **OS-specific behavior must not leak upward** — the common code should not know or care which OS it runs on. No scenario names, no OS flags, no conditional stubs. +5. **Platform defaults live in `config_.go`** — don't hardcode paths or values that differ by OS in common code. 
+ +## The Wrong Way: Noop Stubs + +```go +// ❌ BAD — k8sSwiftV2_linux.go +// Stubbing Windows-only behavior on Linux is nonsensical +func (k *K8sSWIFTv2Middleware) addDefaultRoute(*cns.PodIpInfo, string) {} + +// ❌ BAD — hnsclient_linux.go +// "hnsclient_linux" is nonsensical — HNS is a Windows concept +type linuxHNSClient struct{} +func (l *linuxHNSClient) DeleteEndpoint(id string) error { return nil } +``` + +This forces the reader to mentally track: "if GOOS=linux then this method call is noop." The `if` has moved out of code and into your head — that's worse, not better. + +## The Right Way: Split at the Boundary + +Find the last point of common code before OS divergence. That function gets OS-specific implementations: + +```go +// ❌ BAD — common code calls OS-specific stubs +func processEndpoint(info *EndpointInfo) error { + // ... common logic ... + if err := addDefaultRoute(info); err != nil { // noop on Linux! + return err + } + if err := configureHNS(info); err != nil { // noop on Linux! + return err + } + return nil +} + +// ✅ GOOD — split at the last common boundary +// endpoint.go (common) +func processEndpoint(info *EndpointInfo) error { + // ... common logic ... + return configurePlatformNetworking(info) +} + +// endpoint_linux.go +func configurePlatformNetworking(info *EndpointInfo) error { + return nil // nothing platform-specific needed +} + +// endpoint_windows.go +func configurePlatformNetworking(info *EndpointInfo) error { + if err := addDefaultRoute(info); err != nil { + return err + } + return configureHNS(info) +} +``` + +The key insight: `addDefaultRoute` shouldn't exist in the common abstraction at all. The question is "what platform-specific networking setup is needed?" — and on Linux, the answer is "nothing." + +## Decision Framework + +When you encounter OS-specific behavior: + +1. **Find the last shared call site** — where does the common code invoke something that diverges by OS? +2. 
**That call site is your abstraction boundary** — create `_linux.go` and `_windows.go` versions of that function. +3. **Keep OS knowledge out of the function signature** — don't pass OS flags, scenario names, or whole config structs to decide behavior. +4. **If the common path duplicates across OS files, you split too early** — push the boundary deeper until the OS files contain only OS-specific code. + +| Signal | Problem | Fix | +| --- | --- | --- | +| Noop stub in `_.go` | Abstraction too low | Move boundary up to caller | +| `runtime.GOOS` check | No abstraction at all | Extract to `_.go` files | +| OS name in struct/func name | Wrong abstraction | Name the behavior, not the OS | +| Stub returns `nil` error always | Interface too broad | Narrow interface or eliminate | +| Common code imports OS-specific pkg | Wrong file placement | Move import to `_.go` file | + +## Platform Configuration + +```go +// ❌ BAD — hardcoded paths with OS conditionals +const statePath = "/var/run/azure-cns" + +// ✅ GOOD — config_.go files +// config_linux.go +const defaultStatePath = "/var/run/azure-cns" + +// config_windows.go +const defaultStatePath = `C:\ProgramData\azure-cns` + +// config.go (common) +func statePath(configured string) string { + if configured != "" { + return configured + } + return defaultStatePath +} +``` + +## Prefer Native OS APIs Over Shell-Outs + +```go +// ❌ BAD — shelling out to PowerShell for registry/service operations +func setRegistryValue(key, value string) error { + cmd := exec.Command("powershell", "-Command", + fmt.Sprintf(`Set-ItemProperty -Path "%s" -Name "%s" -Value "%s"`, path, key, value)) + return cmd.Run() +} + +// ✅ GOOD — native Go Windows APIs +func setRegistryValue(key registry.Key, name string, value uint32) error { + return key.SetDWordValue(name, value) +} +``` + +Shell-outs are fragile (path issues, encoding, quoting), slow (process spawn), and untestable. Use `golang.org/x/sys/windows/registry`, `golang.org/x/sys/windows/svc`, etc. 
These are type-safe, testable with mocks, and context-aware. + +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| `func newLinuxRegistryClient` returning `windowsRegistryClient` interface | Name the behavior, not the OS — this is nonsensical | +| Stubbing Windows-only behavior in Linux structs | The calling code is OS-specific — split there instead | +| `hnsclient_linux.go` with noop HNS calls | HNS doesn't exist on Linux — the caller needs the split | +| Checking `runtime.GOOS` to branch behavior | Use `_.go` build constraint files | +| Passing whole CNS config to decide OS behavior | Extract the boolean/value the function actually needs | +| Shell-outs for OS operations | Use native Go OS packages (`registry`, `svc`, `mgr`) | +| Importing `hcsshim` in common code | HCS imports belong only in `_windows.go` files | + +## Cross-References + +- → See `acn-go-design-boundaries` skill for behavioral config vs scenario config +- → See `acn-go-interfaces-dependencies` skill for interface design at the consumer +- → See `samber/cc-skills-golang@golang-code-style` for general file organization diff --git a/.github/skills/acn-go-types-parsing/SKILL.md b/.github/skills/acn-go-types-parsing/SKILL.md new file mode 100644 index 0000000000..fd52c1d2f9 --- /dev/null +++ b/.github/skills/acn-go-types-parsing/SKILL.md @@ -0,0 +1,274 @@ +--- +name: acn-go-types-parsing +description: "Go type safety, standard library type preferences, and the 'parse don't validate' principle. Use when handling IP addresses, MAC addresses, network types, or any string-typed domain data. Trigger on net.IP usage (prefer netip.Addr), string MAC addresses (prefer net.HardwareAddr), stringly-typed variables that should be custom types, or reflect.DeepEqual where == suffices. Also trigger on 'parse don't validate' discussions or type conversion at boundaries." +user-invocable: true +license: MIT +compatibility: Designed for Claude Code or similar AI coding agents, and for projects using Golang. 
+metadata: + author: rbtr + version: "1.0.0" +allowed-tools: Read Edit Write Glob Grep Bash(go:*) Bash(golangci-lint:*) Bash(git:*) Agent +--- + +**Persona:** You are a Go type system advocate. You believe that if data passes through your code as a `string` when a richer type exists, every consumer must re-parse and re-validate it — multiplying bugs. Parse once at the boundary, use types everywhere else. + +**Modes:** + +- **Write mode** — writing new code that handles network types, identifiers, or domain data. Parse to types at the entry point, use typed values throughout. +- **Review mode** — reviewing code for type safety. Flag string IPs, string MACs, stringly-typed config, and unnecessary reflection. Sequential. +- **Audit mode** — auditing type safety across a codebase. Launch up to 3 parallel sub-agents: (1) find `net.IP` usage that should be `netip.Addr`, (2) find string MAC addresses, (3) find stringly-typed domain identifiers. + +> **Complements** `samber/cc-skills-golang@golang-safety` with domain-specific type guidance for network services. + +# Go Type Safety: Parse, Don't Validate + +> [Parse, don't validate](https://lexi-lambda.github.io/blog/2019/11/05/parse-don-t-validate/) — convert data to types at the boundary so that the rest of your code cannot represent invalid states. + +## Core Principle + +Convert raw data (strings, bytes) to typed values at the system boundary. Once parsed, the type carries the validation with it — downstream code cannot accidentally misuse it. + +```go +// ❌ BAD — string IP passed through multiple functions, re-parsed each time +func configureEndpoint(ipStr string) error { + // every consumer must validate: is this v4? v6? valid at all? + ip := net.ParseIP(ipStr) + if ip == nil { + return fmt.Errorf("invalid IP: %s", ipStr) + } + // ... 
+} + +// ✅ GOOD — parse at boundary, use typed value everywhere +func configureEndpoint(addr netip.Addr) error { + // addr is guaranteed valid, can query .Is4()/.Is6() directly + if addr.Is6() { + return configureV6(addr) + } + return configureV4(addr) +} +``` + +## Prefer `netip` Over `net` for IP Handling + +The `net/netip` package (Go 1.18+) provides value types that are comparable, more efficient, and safer than the `net` package equivalents. + +| Old (`net`) | New (`netip`) | Why prefer `netip` | +| --- | --- | --- | +| `net.IP` | `netip.Addr` | Value type, comparable with `==`, `.Is4()`/`.Is6()` methods | +| `net.IPNet` | `netip.Prefix` | Value type, `.Overlaps()` method, no nil footgun | +| `net.ParseIP()` | `netip.ParseAddr()` | Returns error (not nil), forces error handling | +| `net.ParseCIDR()` | `netip.ParsePrefix()` | Returns error, cleaner API | + +```go +// ❌ BAD — old net package, nil return on error +ip := net.ParseIP(ipStr) +if ip == nil { // easy to forget this check + // ... +} + +// ✅ GOOD — netip returns error, forces handling +addr, err := netip.ParseAddr(ipStr) +if err != nil { + return fmt.Errorf("parsing IP %q: %w", ipStr, err) +} +// addr.Is4(), addr.Is6() — no ambiguity +``` + +**Conversion at boundaries:** You will have to convert at existing type boundaries (APIs, structs using `net.IP`), but everything in between gets improved. + +### Subnet Overlap with `netip.Prefix` + +```go +// ✅ Use Prefix.Overlaps() instead of manual bitmask comparison +subnet1, _ := netip.ParsePrefix("10.0.0.0/8") +subnet2, _ := netip.ParsePrefix("10.1.0.0/16") +if subnet1.Overlaps(subnet2) { + // the prefixes share addresses (here 10.0.0.0/8 also fully contains 10.1.0.0/16); + // Overlaps reports overlap, not containment +} +``` + +`Overlaps` reports whether the two prefixes contain any addresses in common, and it interprets the prefix bits correctly even if the CIDR was written with a non-network base address, so it is sufficient alone — no manual bitmask comparison or pre-masking is needed.
+ +## Use `net.HardwareAddr` for MAC Addresses + +```go +// ❌ BAD — MAC as string, format inconsistencies everywhere +type Device struct { + MAC string // "00:11:22:33:44:55" or "00-11-22-33-44-55" or "001122334455"? +} + +// ✅ GOOD — stdlib type, consistent representation +type Device struct { + MAC net.HardwareAddr +} +``` + +There is [a stdlib MAC type](https://pkg.go.dev/net#HardwareAddr). Use it. If the source is a `net.HardwareAddr` already, you don't have to parse it again — compare it with `bytes.Equal(a, b)` rather than `==` because `net.HardwareAddr` is a `[]byte` slice type. Comparing `string(a)`/`string(b)` or `a.String()` values can work, but be explicit about the normalization and formatting tradeoffs. If MAC comparison rules matter in multiple places, centralize them in a helper such as `EqualMAC(a, b net.HardwareAddr) bool`. Consolidate any special handling of MAC parsing into a single place. + +## Custom Types Over String Typing + +```go +// ❌ BAD — stringly typed, easy to mix up +func CreateNC(channelMode string, ncID string, nodeID string) error { + // which string is which? typos compile fine +} + +// ✅ GOOD — custom types prevent misuse +type ChannelMode string +const ( + ChannelModeDirect ChannelMode = "direct" + ChannelModeManaged ChannelMode = "managed" + ChannelModeCRD ChannelMode = "crd" +) + +type NCID string +type NodeID string + +func CreateNC(mode ChannelMode, ncID NCID, nodeID NodeID) error { + // compiler prevents passing nodeID where ncID is expected +} +``` + +Stop kicking the can down the road — fix stringly-typed variables instead of adding more special handling for them. + +## Struct Comparison + +```go +// ❌ BAD — reflect.DeepEqual or cmp.Equal for comparable structs +if reflect.DeepEqual(oldState, newState) { ... } + +// ✅ GOOD — if all fields are comparable, use == +if oldState == newState { ... 
} +``` + +From the [Go spec](https://go.dev/ref/spec#Comparison_operators): "Struct types are comparable if all their field types are comparable." + +**Never use `cmp.Equal` in production code.** From its [documentation](https://pkg.go.dev/github.com/google/go-cmp/cmp): "It is intended to only be used in tests, as performance is not a goal and it may panic if it cannot compare the values." + +## Centralize Formatters + +When an identifier has a canonical string format, implement it as a method on the type: + +```go +// ❌ BAD — magic key prefixes scattered across codebase +key := fmt.Sprintf("NIC_%s_%s", deviceType, uid) +// ... elsewhere: +key := fmt.Sprintf("NIC-%s-%s", deviceType, uid) // inconsistent! + +// ✅ GOOD — centralized as String()/Key()/ID() method +func (d Device) Key() string { + return fmt.Sprintf("NIC_%s_%s", d.Type, d.UID) +} +``` + +These formatters MUST be centralized as `Key()`, `ID()`, or `String()` methods on the type so that they are _always_ used consistently. + +## Typed State Enums and Predicate Filters + +Define state values as typed string constants, then build reusable predicate functions. The same rule applies to public contract surfaces such as `Status`, `Condition`, `Reason`, and `ResponseCode` — if callers can observe it, model it as a named type with shared constants and helpers. + +```go +// ❌ BAD — bare strings for state, ad-hoc filtering everywhere +if state == "Available" || state == "PendingProgramming" { + // ... 
+} + +// ✅ GOOD — typed enum + predicate function +type IPConfigState string + +const ( + Available IPConfigState = "Available" + Allocated IPConfigState = "Allocated" + PendingRelease IPConfigState = "PendingRelease" + PendingProgramming IPConfigState = "PendingProgramming" +) + +// Reusable filter predicate +func IsAvailableForAllocation(state IPConfigState) bool { + return state == Available || state == PendingProgramming +} + +// Used consistently across codebase +for _, ip := range pool { + if IsAvailableForAllocation(ip.State) { + available = append(available, ip) + } +} +``` + +This moves state semantics into shared types. Consumers use predicates instead of repeating conditions with bare strings. + +For public contract fields, typed constants and predicate helpers are mandatory. Never scatter raw `"Failed"`, `"Timeout"`, or `4091` literals through handlers, and never infer a public `Reason` or `ResponseCode` by matching `err.Error()`. Map internal errors to typed public values once with `errors.Is`/`errors.As`, then return that typed surface everywhere else. 
+ +```go +type Status string +type Condition string +type Reason string +type ResponseCode int + +const ( + StatusSucceeded Status = "Succeeded" + StatusFailed Status = "Failed" + + ConditionReady Condition = "Ready" + ConditionDegraded Condition = "Degraded" + + ReasonNone Reason = "None" + ReasonInvalidInput Reason = "InvalidInput" + + ResponseCodeOK ResponseCode = 0 + ResponseCodeInvalidInput ResponseCode = 4001 +) + +func IsFailureStatus(status Status) bool { + return status == StatusFailed +} + +type PublicResponse struct { + Status Status + Condition Condition + Reason Reason + ResponseCode ResponseCode +} + +func mapPublicResponse(err error) PublicResponse { + switch { + case errors.Is(err, ErrInvalidIP): + return PublicResponse{ + Status: StatusFailed, + Condition: ConditionDegraded, + Reason: ReasonInvalidInput, + ResponseCode: ResponseCodeInvalidInput, + } + default: + return PublicResponse{ + Status: StatusSucceeded, + Condition: ConditionReady, + Reason: ReasonNone, + ResponseCode: ResponseCodeOK, + } + } +} +``` + +## Common Mistakes + +| Mistake | Fix | +| --- | --- | +| `net.IP` in new code | Use `netip.Addr` — value type, comparable, returns errors | +| `net.ParseIP` (returns nil) | Use `netip.ParseAddr` (returns error) | +| String MAC addresses | Use `net.HardwareAddr` | +| `channelMode string` parameter | Define `type ChannelMode string` with const values | +| `reflect.DeepEqual` on comparable structs | Use `==` directly | +| `cmp.Equal` in production | Only in tests — it panics by design | +| Format string duplicated in multiple places | Centralize as `Key()`/`ID()`/`String()` method | +| Re-parsing IP string in every function | Parse at boundary, pass `netip.Addr` through | +| Public `Status`/`Condition`/`Reason`/`ResponseCode` handled as raw literals | Define typed constants + predicate helpers, then map internal errors with `errors.Is`/`errors.As` instead of `err.Error()` matching | + +## Cross-References + +- → See 
`samber/cc-skills-golang@golang-safety` for nil interface trap and numeric safety +- → See `samber/cc-skills-golang@golang-naming` for type and constant naming conventions +- → See `acn-go-errors-logging` skill for embedding structured data in errors +- → See `acn-go-interfaces-dependencies` skill for type design at package boundaries diff --git a/.github/workflows/baseimages.yaml b/.github/workflows/baseimages.yaml index 2340f012af..43d35b4f80 100644 --- a/.github/workflows/baseimages.yaml +++ b/.github/workflows/baseimages.yaml @@ -11,14 +11,18 @@ on: types: - checks_requested +permissions: + contents: read + jobs: render: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 - - uses: actions/setup-go@v6 + persist-credentials: false + - uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: go.mod - name: go env diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml index 7924ec38c1..2cb186a5bc 100644 --- a/.github/workflows/codeql.yaml +++ b/.github/workflows/codeql.yaml @@ -33,19 +33,22 @@ jobs: security-events: write steps: - name: Checkout repository - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Setup go - uses: actions/setup-go@v6 + uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: go.mod + cache: false - name: Initialize CodeQL - uses: github/codeql-action/init@v4 + uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 with: languages: ${{ matrix.language }} queries: ./codeql/ - name: Autobuild - uses: github/codeql-action/autobuild@v4 + uses: github/codeql-action/autobuild@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v4 + uses: 
github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/crdgen.yaml b/.github/workflows/crdgen.yaml index 208fbaa46a..9f44dd23c2 100644 --- a/.github/workflows/crdgen.yaml +++ b/.github/workflows/crdgen.yaml @@ -10,15 +10,20 @@ on: merge_group: types: - checks_requested + +permissions: + contents: read + jobs: crdgen: name: CRDs are Generated runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 - - uses: actions/setup-go@v6 + persist-credentials: false + - uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: go.mod - name: Regenerate NodeNetworkConfig CRD diff --git a/.github/workflows/cyclonus-netpol-extended-nightly-test.yaml b/.github/workflows/cyclonus-netpol-extended-nightly-test.yaml index ca730a02f0..acf865cb41 100644 --- a/.github/workflows/cyclonus-netpol-extended-nightly-test.yaml +++ b/.github/workflows/cyclonus-netpol-extended-nightly-test.yaml @@ -6,6 +6,9 @@ on: # run once a day at midnight - cron: "0 0 * * *" +permissions: + contents: read + jobs: cyclonus-test: runs-on: ubuntu-latest @@ -22,14 +25,16 @@ jobs: ] steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - - uses: actions/setup-go@v6 + - uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: go.mod - name: Setup Kind - uses: helm/kind-action@v1 + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 with: version: "v0.22.0" kubectl_version: "v1.27.7" @@ -71,7 +76,7 @@ jobs: mv ./test/cyclonus/cyclonus-test.txt ./cyclonus-test_${{ matrix.profile }}.txt - name: "Upload Logs" - uses: actions/upload-artifact@v7 + uses: 
actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 if: always() with: name: logs-${{ matrix.profile }} diff --git a/.github/workflows/cyclonus-netpol-test.yaml b/.github/workflows/cyclonus-netpol-test.yaml index b21a3b86da..7307121447 100644 --- a/.github/workflows/cyclonus-netpol-test.yaml +++ b/.github/workflows/cyclonus-netpol-test.yaml @@ -14,6 +14,9 @@ on: # run once a day at midnight - cron: '0 0 * * *' +permissions: + contents: read + jobs: cyclonus-test: runs-on: ubuntu-latest @@ -29,14 +32,17 @@ jobs: ] steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - - uses: actions/setup-go@v6 + - uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: go.mod + cache: false - name: Setup Kind - uses: helm/kind-action@v1 + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 with: version: "v0.22.0" kubectl_version: "v1.27.7" @@ -78,7 +84,7 @@ jobs: mv ./test/cyclonus/cyclonus-test.txt ./cyclonus-test_${{ matrix.profile }}.txt - name: 'Upload Logs' - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 if: always() with: name: logs-${{ matrix.profile }} diff --git a/.github/workflows/golangci.yaml b/.github/workflows/golangci.yaml index ed1e288a87..07e2391e4b 100644 --- a/.github/workflows/golangci.yaml +++ b/.github/workflows/golangci.yaml @@ -10,15 +10,21 @@ on: merge_group: types: - checks_requested + +permissions: + contents: read + jobs: generate: name: Generate BPF Handling Code runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Set up Go - uses: actions/setup-go@v6 + uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: go.mod @@ -26,7 
+32,7 @@ jobs: run: make bpf-lib && go generate ./... - name: Upload generated code - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: generated-bpf-program-code path: ./bpf-prog/azure-block-iptables/pkg/blockservice @@ -39,19 +45,20 @@ jobs: needs: generate runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 - - uses: actions/setup-go@v6 + persist-credentials: false + - uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: go.mod - name: Download generated code - uses: actions/download-artifact@v8 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: generated-bpf-program-code path: ./bpf-prog/azure-block-iptables/pkg/blockservice - name: golangci-lint - uses: golangci/golangci-lint-action@v9 + uses: golangci/golangci-lint-action@1e7e51e771db61008b38414a730f564565cf7c20 # v9.2.0 with: version: latest args: ${{ (github.event_name == 'pull_request' && format('--new-from-rev=origin/{0}', github.base_ref)) || (github.event_name == 'merge_group' && format('--new-from-rev={0}', github.event.merge_group.base_sha)) || '' }} --config=.golangci.yml --timeout=25m diff --git a/.github/workflows/govulncheck.yaml b/.github/workflows/govulncheck.yaml index 1877599e84..99eb6c356a 100644 --- a/.github/workflows/govulncheck.yaml +++ b/.github/workflows/govulncheck.yaml @@ -42,6 +42,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Set up Go uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 @@ -56,6 +58,7 @@ jobs: - name: Go generate if: matrix.bpf run: go generate ./... 
+ working-directory: ${{ matrix.module }} - name: Set up Go uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 @@ -76,6 +79,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Verify matrix covers all go.mod files run: | diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index 1ac719dc92..2265059f5b 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -12,7 +12,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@main + - uses: actions/stale@db5d06a4c82d5e94513c09c406638111df61f63e # main id: stale with: ascending: true @@ -27,4 +27,6 @@ jobs: stale-issue-message: 'This issue is stale because it has been open for 2 weeks with no activity. Remove stale label or comment to keep it open.' stale-pr-message: 'This pull request is stale because it has been open for 2 weeks with no activity. 
Remove stale label or comment or this will be closed in 7 days' - name: Print outputs - run: echo ${{ join(steps.stale.outputs.*, ',') }} + env: + STALE_OUTPUTS: ${{ join(steps.stale.outputs.*, ',') }} + run: echo "$STALE_OUTPUTS" diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000..73f52c8403 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,170 @@ +name: Unit Tests + +on: + push: + branches: + - master + - release/* + pull_request: + branches: + - master + - release/* + merge_group: + branches: + - master + - release/* + workflow_dispatch: + schedule: + - cron: "0 6 * * *" # daily at 06:00 UTC + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.event.merge_group.head_sha || github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.event_name != 'merge_group' }} + +jobs: + setup: + name: Setup + runs-on: ubuntu-latest + outputs: + go-version: ${{ steps.goversion.outputs.version }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Determine Go version + id: goversion + run: | + version=$(find . -name go.mod -exec grep -h '^go ' {} \; | awk '{print $2}' | sort -V | tail -1) + if ! 
[[ "$version" =~ ^[0-9]+\.[0-9]+(\.[0-9]+)?$ ]]; then + echo "Invalid Go version: $version" + exit 1 + fi + echo "version=$version" >> $GITHUB_OUTPUT + + test-linux: + name: Test Linux (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + needs: setup + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04] + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Set up Go + uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6 + with: + go-version: ${{ needs.setup.outputs.go-version }} + + - name: Install system dependencies + run: sudo apt-get update -y && sudo apt-get install -y iptables ipset iproute2 + + - name: Build BPF lib + run: make bpf-lib + + - name: Go generate + run: go generate ./... + + - name: Install go-junit-report + run: make go-junit-report + + - name: Run Tests + run: | + # TODO: Refactor this fd-redirect/tee pipeline to work natively under + # bash strict mode (-e/-o pipefail). Strict mode is temporarily disabled + # here to preserve the original exit-code propagation behavior intact. + set +e +o pipefail + { { { { + sudo -E env "PATH=$PATH" make test-all; + echo $? >&3; + } | tee >($(go env GOPATH)/bin/go-junit-report > report.xml) >&4; + } 3>&1; + } | { read xs; exit $xs; } + } 4>&1 + test_exit=$? 
+ set -eo pipefail + + # combine coverage from multiple modules + (echo "mode: atomic"; tail -q -n +2 coverage-*.out) > coverage.cover + mv coverage.cover linux-coverage.out + + (exit "$test_exit") + + - name: Upload Linux Coverage + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + with: + name: linux-coverage-${{ matrix.os }} + path: linux-coverage.out + + test-windows: + name: Test Windows + runs-on: windows-latest + needs: setup + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Set up Go + uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6 + with: + go-version: ${{ needs.setup.outputs.go-version }} + + - name: Run Windows Tests + shell: cmd + run: | + go test -timeout 30m -covermode atomic -coverprofile=windows-coverage.out ./cns/... ./npm/... ./cni/... ./platform/... + go tool cover -func=windows-coverage.out + + - name: Upload Windows Coverage + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + with: + name: windows-coverage + path: windows-coverage.out + + code-coverage: + name: Code Coverage + runs-on: ubuntu-latest + needs: [setup, test-linux] + if: always() + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Set up Go + uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6 + with: + go-version: ${{ needs.setup.outputs.go-version }} + + - name: Download Linux Coverage Artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 + with: + name: linux-coverage-ubuntu-24.04 + path: ./ + + - name: Generate Coverage Report + run: | + # use go work to include multiple modules or gocov will omit results from those modules + make workspace + + make tools + sudo ln -s $(go env GOPATH)/bin/gocov /usr/local/bin/gocov + sudo ln -s $(go env GOPATH)/bin/gocov-xml /usr/local/bin/gocov-xml + + GOOS=linux gocov convert linux-coverage.out > linux-coverage.json + GOOS=linux 
gocov-xml < linux-coverage.json > linux-coverage.xml + + mkdir coverage + mv linux-coverage.xml coverage/ + + - name: Upload Coverage Report + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + with: + name: linux-coverage-report + path: coverage/linux-coverage.xml + + - name: Publish Code Coverage Summary + uses: irongut/CodeCoverageSummary@51cc3a756ddcd398d447c044c02cb6aa83fdae95 # v1.3.0 + with: + filename: coverage/linux-coverage.xml + format: markdown + output: both diff --git a/.pipelines/cni/k8s-e2e/k8s-e2e-step-template.yaml b/.pipelines/cni/k8s-e2e/k8s-e2e-step-template.yaml index 56eaa498fb..bc5a9d399a 100644 --- a/.pipelines/cni/k8s-e2e/k8s-e2e-step-template.yaml +++ b/.pipelines/cni/k8s-e2e/k8s-e2e-step-template.yaml @@ -7,7 +7,6 @@ parameters: processes: "" # Number of parallel processes attempts: "" - steps: - script: | set -ex @@ -24,7 +23,7 @@ steps: # Taint Linux nodes so that windows tests do not run on them if ${{ lower(eq(parameters.os, 'windows')) }} then - kubectl rollout status -n kube-system deployment/konnectivity-agent --timeout=3m + kubectl rollout status -n kube-system deployment/konnectivity-agent --timeout=6m kubectl taint nodes -l kubernetes.azure.com/mode=system node-role.kubernetes.io/control-plane:NoSchedule fi diff --git a/.pipelines/pipeline.yaml b/.pipelines/pipeline.yaml index ebf610af91..3a5070ce7c 100644 --- a/.pipelines/pipeline.yaml +++ b/.pipelines/pipeline.yaml @@ -35,7 +35,10 @@ stages: displayName: Setup pool: name: "$(BUILD_POOL_NAME_DEFAULT)" + variables: + prTargetBranch: $[ variables['System.PullRequest.TargetBranchName'] ] steps: + - template: templates/git-diff-check.yaml - script: | # To use the variables below, you must make the respective stage's dependsOn have - setup or it will not retain context of this stage BUILD_NUMBER=$(Build.BuildNumber) @@ -342,7 +345,6 @@ stages: clusterType: nodesubnet-byocni-nokubeproxy-up clusterName: "cilndsubnete2e" vmSize: Standard_B2s - k8sVersion: "" 
dependsOn: ["test"] # Cilium Overlay E2E tests @@ -404,6 +406,7 @@ stages: dependsOn: ["test"] scaleup: 100 + # Windows Azure Overlay E2E tests - template: singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-job-template.yaml parameters: name: "win_azure_overlay_e2e" @@ -421,7 +424,6 @@ stages: parameters: name: "azure_overlay_stateless_e2e" displayName: Azure Stateless CNI Overlay - os: windows clusterType: overlay-byocni-up clusterName: "statelesswin" vmSize: Standard_B2ms @@ -465,6 +467,7 @@ stages: scaleup: 100 dependsOn: ["test"] + # Windows AKS E2E tests - template: singletenancy/aks/e2e-job-template.yaml parameters: name: "aks_windows_22_e2e" @@ -490,6 +493,7 @@ stages: dependsOn: ["test"] scaleup: 100 + # Windows DualStack Overlay E2E tests - template: singletenancy/dualstack-overlay/dualstackoverlay-e2e-job-template.yaml parameters: name: "win_dualstackoverlay_e2e" @@ -507,111 +511,160 @@ stages: parameters: name: "swiftv2_manifold_e2e" dependsOn: publish - - - stage: delete - displayName: Delete Clusters - condition: always() + + # Delete Cilium Clusters + - stage: delete_cilium_clusters + displayName: Delete Cilium Clusters + condition: and(always(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - setup - - linux_azure_overlay_e2e - - win_azure_overlay_e2e - - azure_overlay_stateless_e2e - - aks_swift_e2e - cilium_e2e - cilium_vnetscale_e2e - cilium_ebpf_podsubnet_e2e - cilium_nodesubnet_e2e - cilium_overlay_e2e - cilium_ebpf_overlay_e2e + - cilium_dualstackoverlay_e2e - cilium_h_overlay_e2e - - aks_ubuntu_22_linux_e2e - - aks_swift_vnetscale_e2e + variables: + commitID: $[ stagedependencies.setup.env.outputs['EnvironmentalVariables.commitID'] ] + pool: + name: $(BUILD_POOL_NAME_DEFAULT) + jobs: + - job: delete + displayName: Delete Cluster + strategy: + matrix: + cilium_podsubnet: + cluster_name: ciliume2e + delete_name: cilium_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + cilium_vnetscale: + cluster_name: 
ciliumvscalee2e + delete_name: cilium_vnetscale_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + cilium_ebpf_podsubnet: + cluster_name: cilbpfpode2e + delete_name: cilium_ebpf_podsubnet_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + cilium_nodesubnet: + cluster_name: cilndsubnete2e + delete_name: cilium_nodesubnet_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + cilium_overlay: + cluster_name: cilovere2e + delete_name: cilium_overlay_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + cilium_ebpf_overlay: + cluster_name: cilbpfovere2e + delete_name: cilium_ebpf_overlay_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + cilium_dualstack_overlay: + cluster_name: cildsovere2e + delete_name: cilium_dualstackoverlay_e2e + delete_region: $(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) + cilium_overlay_hubble: + cluster_name: cilwhleovere2e + delete_name: cilium_h_overlay_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + steps: + - template: templates/delete-cluster.yaml + parameters: + name: $(delete_name) + clusterName: $(cluster_name)-$(commitID) + region: $(delete_region) + sub: $(SUB_AZURE_NETWORK_AGENT_BUILD_VALIDATIONS) + svcConn: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + + # Delete Windows Clusters + - stage: delete_windows_clusters + displayName: Delete Windows Clusters + condition: and(always(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunWindowsTests'], 'true')) + dependsOn: + - setup + - win_azure_overlay_e2e + - azure_overlay_stateless_e2e - aks_windows_22_e2e - - linux_dualstackoverlay_e2e - win_dualstackoverlay_e2e - - cilium_dualstackoverlay_e2e variables: commitID: $[ stagedependencies.setup.env.outputs['EnvironmentalVariables.commitID'] ] + pool: + name: $(BUILD_POOL_NAME_DEFAULT) jobs: - - job: delete_build + - job: delete displayName: Delete Cluster - pool: - name: "$(BUILD_POOL_NAME_DEFAULT)" strategy: matrix: - cilium_e2e: - name: cilium_e2e - clusterName: "ciliume2e" - region: $(REGION_AKS_CLUSTER_TEST) - cilium_vnetscale_e2e: - name: 
cilium_vnetscale_e2e - clusterName: "ciliumvscalee2e" - region: $(REGION_AKS_CLUSTER_TEST) - cilium_ebpf_podsubnet_e2e: - name: cilium_ebpf_podsubnet_e2e - clusterName: "cilbpfpode2e" - region: $(REGION_AKS_CLUSTER_TEST) - cilium_nodesubnet_e2e: - name: cilium_nodesubnet_e2e - clusterName: "cilndsubnete2e" - region: $(REGION_AKS_CLUSTER_TEST) - cilium_overlay_e2e: - name: cilium_overlay_e2e - clusterName: "cilovere2e" - region: $(REGION_AKS_CLUSTER_TEST) - cilium_ebpf_overlay_e2e: - name: cilium_ebpf_overlay_e2e - clusterName: "cilbpfovere2e" - region: $(REGION_AKS_CLUSTER_TEST) - cilium_h_overlay_e2e: - name: cilium_h_overlay_e2e - clusterName: "cilwhleovere2e" - region: $(REGION_AKS_CLUSTER_TEST) - linux_azure_overlay_e2e: - name: linux_azure_overlay_e2e - clusterName: "linuxazovere2e" - region: $(REGION_AKS_CLUSTER_TEST) - windows_azure_overlay_e2e: - name: win_azure_overlay_e2e - clusterName: "winazovere2e" - region: $(REGION_AKS_CLUSTER_TEST) - azure_overlay_stateless_e2e: - name: azure_overlay_stateless_e2e - clusterName: "statelesswin" - region: $(REGION_AKS_CLUSTER_TEST) - aks_swift_e2e: - name: aks_swift_e2e - clusterName: "swifte2e" - region: $(REGION_AKS_CLUSTER_TEST) - aks_swift_vnetscale_e2e: - name: aks_swift_vnetscale_e2e - clusterName: "vscaleswifte2e" - region: $(REGION_AKS_CLUSTER_TEST) - aks_ubuntu_22_linux_e2e: - name: aks_ubuntu_22_linux_e2e - clusterName: "ubuntu22e2e" - region: $(REGION_AKS_CLUSTER_TEST) - aks_windows_22_e2e: - name: aks_windows_22_e2e - clusterName: "win22e2e" - region: $(REGION_AKS_CLUSTER_TEST) - linux_dualstackoverlay_e2e: - name: linux_dualstackoverlay_e2e - clusterName: "linuxdsovere2e" - region: $(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) - windows_dualstackoverlay_e2e: - name: windows_dualstackoverlay_e2e - clusterName: "windsovere2e" - region: $(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) - cilium_dualstackoverlay_e2e: - name: cilium_dualstackoverlay_e2e - clusterName: "cildsovere2e" - region: 
$(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) + azure_overlay_windows: + cluster_name: winazovere2e + delete_name: win_azure_overlay_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + azure_overlay_stateless: + cluster_name: statelesswin + delete_name: azure_overlay_stateless_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + aks_windows_22: + cluster_name: win22e2e + delete_name: aks_windows_22_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + dualstack_overlay_windows: + cluster_name: windsovere2e + delete_name: win_dualstackoverlay_e2e + delete_region: $(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) steps: - template: templates/delete-cluster.yaml parameters: - name: $(name) - clusterName: $(clusterName)-$(commitID) - region: $(region) + name: $(delete_name) + clusterName: $(cluster_name)-$(commitID) + region: $(delete_region) + sub: $(SUB_AZURE_NETWORK_AGENT_BUILD_VALIDATIONS) + svcConn: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + + # Delete Linux Clusters + - stage: delete_linux_clusters + displayName: Delete Linux Clusters + condition: always() + dependsOn: + - setup + - linux_azure_overlay_e2e + - aks_swift_e2e + - aks_swift_vnetscale_e2e + - aks_ubuntu_22_linux_e2e + - linux_dualstackoverlay_e2e + variables: + commitID: $[ stagedependencies.setup.env.outputs['EnvironmentalVariables.commitID'] ] + pool: + name: $(BUILD_POOL_NAME_DEFAULT) + jobs: + - job: delete + displayName: Delete Cluster + strategy: + matrix: + azure_overlay_linux: + cluster_name: linuxazovere2e + delete_name: linux_azure_overlay_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + aks_swift: + cluster_name: swifte2e + delete_name: aks_swift_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + aks_swift_vnetscale: + cluster_name: vscaleswifte2e + delete_name: aks_swift_vnetscale_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + aks_ubuntu_22: + cluster_name: ubuntu22e2e + delete_name: aks_ubuntu_22_linux_e2e + delete_region: $(REGION_AKS_CLUSTER_TEST) + dualstack_overlay_linux: + cluster_name: linuxdsovere2e + 
delete_name: linux_dualstackoverlay_e2e + delete_region: $(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) + steps: + - template: templates/delete-cluster.yaml + parameters: + name: $(delete_name) + clusterName: $(cluster_name)-$(commitID) + region: $(delete_region) sub: $(SUB_AZURE_NETWORK_AGENT_BUILD_VALIDATIONS) svcConn: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) diff --git a/.pipelines/singletenancy/aks-swift/e2e-job-template.yaml b/.pipelines/singletenancy/aks-swift/e2e-job-template.yaml index 7242b8d8f6..c4c06918af 100644 --- a/.pipelines/singletenancy/aks-swift/e2e-job-template.yaml +++ b/.pipelines/singletenancy/aks-swift/e2e-job-template.yaml @@ -1,12 +1,28 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - os: "" - dependsOn: "" + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: os + type: string + default: "" + - name: dependsOn + type: object + default: "" stages: - stage: ${{ parameters.clusterName }} @@ -33,6 +49,7 @@ stages: - stage: ${{ parameters.name }} displayName: E2E-${{ parameters.displayName }} + condition: and(succeeded(), eq(variables.TAG, variables.CURRENT_VERSION)) dependsOn: - setup - publish @@ -44,7 +61,6 @@ stages: GOPATH: "$(Agent.TempDirectory)/go" # Go workspace path GOBIN: "$(GOPATH)/bin" # Go binaries path modulePath: "$(GOPATH)/src/github.com/Azure/azure-container-networking" - condition: and(succeeded(), eq(variables.TAG, variables.CURRENT_VERSION)) pool: name: $(BUILD_POOL_NAME_DEFAULT) jobs: @@ -62,24 +78,21 @@ stages: name: ${{ parameters.name }} clusterName: ${{ parameters.clusterName }}-$(commitID) scaleup: 100 - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: 
$(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - dependsOn: ${{ parameters.name }} - datapath: true - dns: true - portforward: true - hostport: true - service: true + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + datapath: true + dns: true + portforward: true + hostport: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_linux condition: failed() steps: - template: ../../templates/log-template.yaml @@ -87,3 +100,4 @@ stages: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cniv2 + diff --git a/.pipelines/singletenancy/aks/e2e-job-template.yaml b/.pipelines/singletenancy/aks/e2e-job-template.yaml index c695953203..0eac68f672 100644 --- a/.pipelines/singletenancy/aks/e2e-job-template.yaml +++ b/.pipelines/singletenancy/aks/e2e-job-template.yaml @@ -1,19 +1,45 @@ parameters: - name: "" - displayName: "" - arch: "" - os: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - os_version: "" - scaleup: "" - dependsOn: "" + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: arch + type: string + default: "" + - name: os + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: os_version + type: string + default: "" + - name: scaleup + type: string + default: "" + - name: dependsOn + type: object + default: "" stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: or( + eq('${{ parameters.os }}', 'linux'), + and(eq('${{ parameters.os }}', 
'windows'), eq(dependencies.setup.outputs['env.GitDiffCheck.RunWindowsTests'], 'true')) + ) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -52,11 +78,7 @@ stages: - job: ${{ parameters.name }} displayName: Singletenancy AKS - (${{ parameters.name }}) timeoutInMinutes: 120 - # continue on error only in linux case to workaround known cniv1 issue where we leak an ip - # this occurs when the azure vnet ipam releases an ip from its statefile and sends a successful - # delete response but the azure cni gets a broken pipe error and doesn't delete the ip from its - # statefile. a symptom of this problem is 'Initializing HTTP client with connection timeout' - # appearing in the logs, which is checked below + # error handled by warning-handler job below continueOnError: ${{ eq(parameters.os, 'linux') }} pool: name: $(BUILD_POOL_NAME_DEFAULT) @@ -72,39 +94,45 @@ stages: os: ${{ parameters.os }} os_version: ${{ parameters.os_version }} scaleup: ${{ parameters.scaleup }} - + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + datapath: true + dns: true + portforward: true + hybridWin: true + service: true + hostport: true + # can only succeed with issues in linux - template: ../../templates/warning-handler-job-template.yaml parameters: dependsOnToEmitWarning: ${{ parameters.name }} clusterName: ${{ parameters.clusterName }}-$(commitID) fileSearchPattern: "azure-vnet*.log" - searchPattern: "Initializing HTTP client with connection timeout" + # continue on error only in linux case to workaround known cniv1 issue where we leak an ip + # this occurs when the azure vnet ipam releases an ip from its statefile and sends a successful + # delete response but the azure cni gets a broken pipe error and doesn't delete the ip from its + # statefile. 
a symptom of this problem is 'Initializing HTTP client with connection timeout' + # appearing in the logs, which is checked below + # similar reasoning for exec: already started, which appears to be a bug in the upstream cni framework + # where if an exec fails, the cni framework retries on the same command object which will always fail + # because command objects should never be re-used + searchPattern: "Initializing HTTP client with connection timeout|exec: already started" cni: cniv1 os: ${{ parameters.os }} - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - datapath: true - dns: true - portforward: true - hybridWin: true - service: true - hostport: true - dependsOn: ${{ parameters.name }} - - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} - condition: failed() + condition: or(failed(), eq(dependencies.${{ parameters.name }}.result, 'SucceededWithIssues')) steps: - template: ../../templates/log-template.yaml parameters: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cniv1 + diff --git a/.pipelines/singletenancy/azure-cni-overlay-stateless/azure-cni-overlay-stateless-e2e-job-template.yaml b/.pipelines/singletenancy/azure-cni-overlay-stateless/azure-cni-overlay-stateless-e2e-job-template.yaml index 25e33b7761..cdca3a53f2 100644 --- a/.pipelines/singletenancy/azure-cni-overlay-stateless/azure-cni-overlay-stateless-e2e-job-template.yaml +++ b/.pipelines/singletenancy/azure-cni-overlay-stateless/azure-cni-overlay-stateless-e2e-job-template.yaml @@ -1,15 +1,30 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - dependsOn: "" + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + 
default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: dependsOn + type: object + default: "" stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: eq(dependencies.setup.outputs['env.GitDiffCheck.RunWindowsTests'], 'true') dependsOn: - ${{ parameters.dependsOn }} - setup @@ -17,6 +32,7 @@ stages: name: $(BUILD_POOL_NAME_DEFAULT) variables: commitID: $[ stagedependencies.setup.env.outputs['EnvironmentalVariables.commitID'] ] + runWindowsTest: $[ stagedependencies.setup.env.outputs['GitDiffCheck.RunWindowsTests'] ] jobs: - template: ../../templates/create-cluster.yaml parameters: @@ -51,42 +67,55 @@ stages: - job: ${{ parameters.name }}_windows displayName: Azure Stateless CNI Overlay Test Suite | Windows - (${{ parameters.name }}) timeoutInMinutes: 120 + # error handled by warning-handler job below + continueOnError: true pool: name: $(BUILD_POOL_NAME_DEFAULT) demands: - agent.os -equals Linux - Role -equals $(CUSTOM_E2E_ROLE) steps: - - template: azure-cni-overlay-stateless-e2e-step-template.yaml + - template: ../../templates/windows-overlay-e2e-step-template.yaml parameters: name: ${{ parameters.name }} clusterName: ${{ parameters.clusterName }}-$(commitID) + cniType: stateless + validateCniType: stateless + testLoadArgs: "INSTALL_AZURE_VNET_STATELESS=true VALIDATE_V4OVERLAY=true" + runWireserverTests: "true" + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) os: windows - vmSizeWin: ${{ parameters.vmSize }} + datapath: true + dns: true + portforward: true + hostport: true + service: true + hybridWin: true - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml + # can only succeed with issues in windows + - template: 
../../templates/warning-handler-job-template.yaml parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + dependsOnToEmitWarning: ${{ parameters.name }}_windows clusterName: ${{ parameters.clusterName }}-$(commitID) + fileSearchPattern: "azure-vnet*.log" + # known windows issues that occur only in testing: + # - windows blue screens and writes null characters to the statefile, making it unrecoverable + # - the hns network gets deleted outside of the cni's control, so it still thinks the network exists + searchPattern: "\\x00|\\\\x00|hcnOpenNetwork failed in Win32: The network was not found" + cni: cniv2 os: windows - dependsOn: ${{ parameters.name }}_windows - datapath: true - dns: true - portforward: true - hostport: true - service: true - hybridWin: true - job: failedE2ELogs_windows displayName: "Windows Failure Logs" dependsOn: - ${{ parameters.name }}_windows - - cni_windows - condition: in(dependencies.${{ parameters.name }}_windows.result, 'Failed') + condition: in(dependencies.${{ parameters.name }}_windows.result, 'Failed', 'SucceededWithIssues') steps: - template: ../../templates/log-template.yaml parameters: clusterName: ${{ parameters.clusterName }}-$(commitID) os: windows cni: cniv2 - diff --git a/.pipelines/singletenancy/azure-cni-overlay-stateless/azure-cni-overlay-stateless-e2e-step-template.yaml b/.pipelines/singletenancy/azure-cni-overlay-stateless/azure-cni-overlay-stateless-e2e-step-template.yaml deleted file mode 100644 index 09dbff2188..0000000000 --- a/.pipelines/singletenancy/azure-cni-overlay-stateless/azure-cni-overlay-stateless-e2e-step-template.yaml +++ /dev/null @@ -1,93 +0,0 @@ -parameters: - name: "" - clusterName: "" - os: "" - -steps: - - bash: | - go version - go env - mkdir -p '$(GOBIN)' - mkdir -p '$(GOPATH)/pkg' - mkdir -p '$(modulePath)' - echo '##vso[task.prependpath]$(GOBIN)' - echo '##vso[task.prependpath]$(GOROOT)/bin' - name: "GoEnv" - displayName: "Set up the Go environment" - - - task: KubectlInstaller@0 - inputs: - 
kubectlVersion: latest - - - task: AzureCLI@2 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - make -C ./hack/aks set-kubeconf AZCLI=az CLUSTER=${{ parameters.clusterName }} - name: "kubeconfig" - displayName: "Set Kubeconfig" - - - script: | - nodeList=`kubectl get node -owide | grep Windows | awk '{print $1}'` - for node in $nodeList; do - taint=`kubectl describe node $node | grep Taints | awk '{print $2}'` - if [ $taint == "node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule" ]; then - kubectl taint nodes $node node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule- - fi - done - sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=windows CNI_TYPE=stateless VALIDATE_STATEFILE=true INSTALL_CNS=true INSTALL_AZURE_VNET_STATELESS=true VALIDATE_V4OVERLAY=true CNS_VERSION=$(make cns-version) CNI_VERSION=$(make cni-version) CLEANUP=true - name: "WindowsOverlayControlPlaneScaleTests" - displayName: "Windows v4Overlay ControlPlane Scale Tests" - retryCountOnTaskFailure: 2 - - - task: AzureCLI@2 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - kubectl get po -owide -A - clusterName=${{ parameters.clusterName }} - echo "Restarting nodes" - for val in $(az vmss list -g MC_${clusterName}_${clusterName}_$(REGION_AKS_CLUSTER_TEST) --query "[].name" -o tsv); do - make -C ./hack/aks restart-vmss AZCLI=az CLUSTER=${clusterName} REGION=$(REGION_AKS_CLUSTER_TEST) VMSS_NAME=${val} - done - displayName: "Restart Nodes" - - - task: AzureCLI@2 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - cd test/integration/load - clusterName=${{ parameters.clusterName }} - make -C 
./hack/aks set-kubeconf AZCLI=az CLUSTER=${clusterName} - make -C ./hack/aks azcfg AZCLI=az REGION=$(REGION_AKS_CLUSTER_TEST) - kubectl get pods -owide -A - echo "Validating Node Restart" - CNI_TYPE=stateless RESTART_CASE=true go test -timeout 30m -tags load -run ^TestValidateState$ - displayName: "Validate Node Restart" - retryCountOnTaskFailure: 3 - - - script: | - echo "Run wireserver and metadata connectivity Tests" - bash test/network/wireserver_metadata_test.sh - retryCountOnTaskFailure: 3 - name: "WireserverMetadataConnectivityTests" - displayName: "Run Wireserver and Metadata Connectivity Tests" - - - script: | - echo "IPv4 Overlay DataPath Test" - cd test/integration/datapath - sudo -E env "PATH=$PATH" go test -v -count=1 datapath_windows_test.go -timeout 3m -tags connection -restartKubeproxy true -run ^TestDatapathWin$ - name: "WindowsV4OverlayDatapathTests" - displayName: "Windows v4Overlay Datapath Tests" - retryCountOnTaskFailure: 3 diff --git a/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-job-template.yaml b/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-job-template.yaml index 2999fb3add..c2586456a6 100644 --- a/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-job-template.yaml +++ b/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-job-template.yaml @@ -1,17 +1,39 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - dependsOn: "" - scaleup: "" - os: "" + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: dependsOn + type: object + default: "" + - name: scaleup + type: string + default: "" + - name: os + type: string + default: "" stages: - stage: ${{ parameters.clusterName }} 
displayName: ${{ parameters.displayName }} - Create Cluster + condition: or( + eq('${{ parameters.os }}', 'linux'), + and(eq('${{ parameters.os }}', 'windows'), eq(dependencies.setup.outputs['env.GitDiffCheck.RunWindowsTests'], 'true')) + ) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -19,6 +41,7 @@ stages: name: $(BUILD_POOL_NAME_DEFAULT) variables: commitID: $[ stagedependencies.setup.env.outputs['EnvironmentalVariables.commitID'] ] + runWindowsTest: $[ stagedependencies.setup.env.outputs['GitDiffCheck.RunWindowsTests'] ] jobs: - template: ../../templates/create-cluster.yaml parameters: @@ -50,41 +73,65 @@ stages: - job: ${{ parameters.name }}_${{ parameters.os }} displayName: Azure CNI Overlay Test Suite | ${{ parameters.os }} - (${{ parameters.name }}) timeoutInMinutes: 120 + # error handled by warning-handler job below + continueOnError: ${{ eq(parameters.os, 'windows') }} pool: name: $(BUILD_POOL_NAME_DEFAULT) demands: - agent.os -equals Linux - Role -equals $(CUSTOM_E2E_ROLE) steps: - - template: azure-cni-overlay-e2e-step-template.yaml + - ${{ if eq(parameters.os, 'linux') }}: + - template: azure-cni-overlay-e2e-step-template.yaml + parameters: + name: ${{ parameters.name }} + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + scaleup: ${{ parameters.scaleup }} + - ${{ if eq(parameters.os, 'windows') }}: + - template: ../../templates/windows-overlay-e2e-step-template.yaml + parameters: + name: ${{ parameters.name }} + clusterName: ${{ parameters.clusterName }}-$(commitID) + cniType: cniv2 + validateCniType: cniv2 + testLoadArgs: "INSTALL_AZURE_CNI_OVERLAY=true VALIDATE_V4OVERLAY=true" + scaleup: ${{ parameters.scaleup }} + - template: ../../templates/k8s-e2e-step-template.yaml parameters: - name: ${{ parameters.name }} + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} - scaleup: ${{ parameters.scaleup }} # 50 in windows or 100 in linux + 
datapath: true + dns: true + portforward: true + hostport: true + service: true + hybridWin: ${{ eq(parameters.os, 'windows') }} - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - dependsOn: ${{ parameters.name }}_${{ parameters.os }} - datapath: true - dns: true - portforward: true - hostport: true - service: true - hybridWin: ${{ eq(parameters.os, 'windows') }} + # can only succeed with issues in windows + - ${{ if eq(parameters.os, 'windows') }}: + - template: ../../templates/warning-handler-job-template.yaml + parameters: + dependsOnToEmitWarning: ${{ parameters.name }}_${{ parameters.os }} + clusterName: ${{ parameters.clusterName }}-$(commitID) + fileSearchPattern: "azure-vnet*.log" + # known windows issues that occur only in testing: + # - windows blue screens and writes null characters to the statefile, making it unrecoverable + # - the hns network gets deleted outside of the cni's control, so it still thinks the network exists + searchPattern: "\\x00|\\\\x00|hcnOpenNetwork failed in Win32: The network was not found" + cni: cniv2 + os: windows - job: failedE2ELogs_${{ parameters.os }} displayName: "${{ parameters.os }} Failure Logs" dependsOn: - ${{ parameters.name }}_${{ parameters.os }} - - CNI_${{ parameters.os }} - condition: in(dependencies.${{ parameters.name }}_${{ parameters.os }}.result, 'Failed') + condition: in(dependencies.${{ parameters.name }}_${{ parameters.os }}.result, 'Failed', 'SucceededWithIssues') steps: - template: ../../templates/log-template.yaml parameters: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cniv2 + diff --git a/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-step-template.yaml b/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-step-template.yaml index 8d8de54f55..38828a0388 100644 --- 
a/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-step-template.yaml +++ b/.pipelines/singletenancy/azure-cni-overlay/azure-cni-overlay-e2e-step-template.yaml @@ -32,114 +32,55 @@ steps: name: "kubeconfig" displayName: "Set Kubeconfig" - - ${{ if eq(parameters.os, 'linux') }}: - - script: | - echo "Start Integration Tests on Overlay Cluster" - kubectl get po -owide -A - sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=linux CNI_TYPE=cniv2 VALIDATE_STATEFILE=true INSTALL_CNS=true INSTALL_AZURE_CNI_OVERLAY=true VALIDATE_V4OVERLAY=true AZURE_IPAM_VERSION=$(make azure-ipam-version) CNS_VERSION=$(make cns-version) CNI_VERSION=$(make cni-version) CLEANUP=true - retryCountOnTaskFailure: 2 - name: "integrationTest" - displayName: "Run CNS Integration Tests on AKS Overlay" - - - task: AzureCLI@2 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - kubectl get po -owide -A - clusterName=${{ parameters.clusterName }} - echo "Restarting nodes" - for val in $(az vmss list -g MC_${clusterName}_${clusterName}_$(REGION_AKS_CLUSTER_TEST) --query "[].name" -o tsv); do - make -C ./hack/aks restart-vmss AZCLI=az CLUSTER=${clusterName} REGION=$(REGION_AKS_CLUSTER_TEST) VMSS_NAME=${val} - done - displayName: "Restart Nodes" - - - task: AzureCLI@2 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - cd test/integration/load - - # Scale Cluster Up/Down to confirm functioning CNS - ITERATIONS=2 SCALE_UP=${{ parameters.scaleup }} OS_TYPE=linux go test -count 1 -timeout 30m -tags load -run ^TestLoad$ - kubectl get pods -owide -A - - cd ../../.. 
- echo "Validating Node Restart" - make test-validate-state OS_TYPE=linux RESTART_CASE=true CNI_TYPE=cniv2 - kubectl delete ns load-test - displayName: "Validate Node Restart" - retryCountOnTaskFailure: 3 - - - script: | - echo "Run wireserver and metadata connectivity Tests" - bash test/network/wireserver_metadata_test.sh - retryCountOnTaskFailure: 3 - name: "WireserverMetadataConnectivityTests" - displayName: "Run Wireserver and Metadata Connectivity Tests" + - script: | + echo "Start Integration Tests on Overlay Cluster" + kubectl get po -owide -A + sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=linux CNI_TYPE=cniv2 VALIDATE_STATEFILE=true INSTALL_CNS=true INSTALL_AZURE_CNI_OVERLAY=true VALIDATE_V4OVERLAY=true AZURE_IPAM_VERSION=$(make azure-ipam-version) CNS_VERSION=$(make cns-version) CNI_VERSION=$(make cni-version) CLEANUP=true + retryCountOnTaskFailure: 2 + name: "integrationTest" + displayName: "Run CNS Integration Tests on AKS Overlay" - - ${{ if eq(parameters.os, 'windows') }}: - - script: | - nodeList=`kubectl get node -owide | grep Windows | awk '{print $1}'` - for node in $nodeList; do - taint=`kubectl describe node $node | grep Taints | awk '{print $2}'` - if [ $taint == "node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule" ]; then - kubectl taint nodes $node node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule- - fi + - task: AzureCLI@2 + inputs: + azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + kubectl get po -owide -A + clusterName=${{ parameters.clusterName }} + echo "Restarting nodes" + for val in $(az vmss list -g MC_${clusterName}_${clusterName}_$(REGION_AKS_CLUSTER_TEST) --query "[].name" -o tsv); do + make -C ./hack/aks restart-vmss AZCLI=az CLUSTER=${clusterName} REGION=$(REGION_AKS_CLUSTER_TEST) VMSS_NAME=${val} done - sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=windows 
CNI_TYPE=cniv2 VALIDATE_STATEFILE=true INSTALL_CNS=true INSTALL_AZURE_CNI_OVERLAY=true VALIDATE_V4OVERLAY=true CNS_VERSION=$(make cns-version) CNI_VERSION=$(make cni-version) CLEANUP=true - name: "WindowsOverlayControlPlaneScaleTests" - displayName: "Windows v4Overlay ControlPlane Scale Tests" - retryCountOnTaskFailure: 2 + displayName: "Restart Nodes" - - script: | - echo "IPv4 Overlay DataPath Test" - cd test/integration/datapath - sudo -E env "PATH=$PATH" go test -count=1 datapath_windows_test.go -timeout 3m -tags connection -restartKubeproxy true -run ^TestDatapathWin$ - name: "WindowsV4OverlayDatapathTests" - displayName: "Windows v4Overlay Datapath Tests" - retryCountOnTaskFailure: 3 + - task: AzureCLI@2 + inputs: + azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + cd test/integration/load - - task: AzureCLI@1 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - kubectl get po -owide -A - clusterName=${{ parameters.clusterName }} - echo "Restarting nodes" - for val in $(az vmss list -g MC_${clusterName}_${clusterName}_$(REGION_AKS_CLUSTER_TEST) --query "[].name" -o tsv); do - make -C ./hack/aks restart-vmss AZCLI=az CLUSTER=${clusterName} REGION=$(REGION_AKS_CLUSTER_TEST) VMSS_NAME=${val} - done - displayName: "Restart Nodes" + # Scale Cluster Up/Down to confirm functioning CNS + ITERATIONS=2 SCALE_UP=${{ parameters.scaleup }} OS_TYPE=linux go test -count 1 -timeout 30m -tags load -run ^TestLoad$ + kubectl get pods -owide -A - - task: AzureCLI@1 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - cd test/integration/load + cd ../../.. 
+ echo "Validating Node Restart" + make test-validate-state OS_TYPE=linux RESTART_CASE=true CNI_TYPE=cniv2 + kubectl delete ns load-test + displayName: "Validate Node Restart" + retryCountOnTaskFailure: 3 - # Scale Cluster Up/Down to confirm functioning CNS - ITERATIONS=2 SCALE_UP=${{ parameters.scaleup }} OS_TYPE=windows go test -count 1 -timeout 30m -tags load -run ^TestLoad$ - kubectl get pods -owide -A + - script: | + echo "Run wireserver and metadata connectivity Tests" + bash test/network/wireserver_metadata_test.sh + retryCountOnTaskFailure: 3 + name: "WireserverMetadataConnectivityTests" + displayName: "Run Wireserver and Metadata Connectivity Tests" - cd ../../.. - echo "Validating Node Restart" - make test-validate-state OS_TYPE=windows RESTART_CASE=true CNI_TYPE=cniv2 - kubectl delete ns load-test - displayName: "Validate Node Restart" - retryCountOnTaskFailure: 3 diff --git a/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-job-template.yaml b/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-job-template.yaml index b40ad17004..d53b80da99 100644 --- a/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-job-template.yaml +++ b/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-job-template.yaml @@ -1,15 +1,33 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - dependsOn: "" + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: dependsOn + type: object + default: "" + - name: os + type: string + default: "" stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: and(succeeded(), 
eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -31,6 +49,7 @@ stages: - stage: ${{ parameters.name }} displayName: E2E - ${{ parameters.displayName }} + condition: and(succeeded(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - setup - publish @@ -58,24 +77,21 @@ stages: name: ${{ parameters.name }} clusterName: ${{ parameters.clusterName }}-$(commitID) scaleup: 100 - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - cni: cilium - dependsOn: ${{ parameters.name }} - dualstack: true - dns: true - portforward: true - service: true + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + cni: cilium + dualstack: true + dns: true + portforward: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} condition: failed() steps: - template: ../../templates/log-template.yaml @@ -83,3 +99,4 @@ stages: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cilium + diff --git a/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-step-template.yaml b/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-step-template.yaml index 280da7d9ff..545f74d036 100644 --- a/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium-dualstack-overlay/cilium-dualstackoverlay-e2e-step-template.yaml @@ -48,6 +48,7 @@ steps: kubectl get po -owide -A name: "installCilium" displayName: "Install Cilium on AKS Dualstack Overlay" + 
retryCountOnTaskFailure: 2 - script: | echo "Start Azilium E2E Tests on Overlay Cluster" diff --git a/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-job-template.yaml b/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-job-template.yaml index 0ea33b2f30..e4aeb661c7 100644 --- a/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-job-template.yaml +++ b/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-job-template.yaml @@ -61,24 +61,21 @@ stages: name: ${{ parameters.name }} clusterName: ${{ parameters.clusterName }}-$(commitID) scaleup: 50 - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - cni: cilium - dependsOn: ${{ parameters.name }} - datapath: true - dns: true - portforward: true - service: true + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + cni: cilium + datapath: true + dns: true + portforward: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} condition: failed() steps: - template: ../../templates/log-template.yaml diff --git a/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-step-template.yaml b/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-step-template.yaml index a1ed74eefa..08bbb90f8a 100644 --- a/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium-ebpf/cilium-e2e-step-template.yaml @@ -30,6 +30,7 @@ steps: kubectl get pods -Aowide name: "installCilium" displayName: "Install EBPF Podsubnet Cilium" + retryCountOnTaskFailure: 2 - script: | echo "Start Azilium E2E Tests" diff --git a/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-job-template.yaml 
b/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-job-template.yaml index 2d3b8d2a43..1dc272547f 100644 --- a/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-job-template.yaml +++ b/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-job-template.yaml @@ -1,19 +1,48 @@ parameters: - dependsOn: "" - name: "cilium_nodesubnet_e2e" - clusterType: "nodesubnet-byocni-nokubeproxy-up" - clusterName: "cilndsubnete2e" - vmSize: "" - os: "linux" - arch: "" - osSKU: Ubuntu - hubbleEnabled: false - dualstackVersion: "" - cni: "cilium" + - name: dependsOn + type: object + default: "" + - name: name + type: string + default: cilium_nodesubnet_e2e + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: nodesubnet-byocni-nokubeproxy-up + - name: clusterName + type: string + default: cilndsubnete2e + - name: vmSize + type: string + default: "" + - name: os + type: string + default: linux + - name: arch + type: string + default: "" + - name: osSKU + type: string + default: Ubuntu + - name: hubbleEnabled + type: boolean + default: false + - name: dualstackVersion + type: string + default: "" + - name: cni + type: string + default: cilium + - name: scaleup + type: string + default: "" stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: and(succeeded(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -33,6 +62,7 @@ stages: - stage: ${{ parameters.name }} displayName: E2E-${{ parameters.displayName }} + condition: and(succeeded(), eq(variables.TAG, variables.CURRENT_VERSION), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) variables: TAG: $[ stagedependencies.setup.env.outputs['EnvironmentalVariables.Tag'] ] CURRENT_VERSION: $[ stagedependencies.containerize.check_tag.outputs['CurrentTagManifests.currentTagManifests'] ] @@ -40,7 
+70,6 @@ stages: GOPATH: "$(Agent.TempDirectory)/go" # Go workspace path GOBIN: "$(GOPATH)/bin" # Go binaries path modulePath: "$(GOPATH)/src/github.com/Azure/azure-container-networking" - condition: and(succeeded(), eq(variables.TAG, variables.CURRENT_VERSION)) dependsOn: - setup - publish @@ -64,24 +93,21 @@ stages: arch: ${{ parameters.arch }} os: ${{ parameters.os }} scaleup: ${{ parameters.scaleup }} - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - datapath: true - dns: true - cni: cilium - portforward: true - service: true - dependsOn: ${{ parameters.name }} + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + datapath: true + dns: true + cni: cilium + portforward: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} condition: failed() steps: - template: ../../templates/log-template.yaml @@ -89,3 +115,4 @@ stages: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cilium + diff --git a/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-step-template.yaml b/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-step-template.yaml index 1dfdae206b..280a32351c 100644 --- a/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium-nodesubnet/cilium-nodesubnet-e2e-step-template.yaml @@ -70,6 +70,7 @@ steps: kubectl get crd -A name: "installCilium" displayName: "Install Cilium" + retryCountOnTaskFailure: 2 - script: | echo "Start Nodesubnet E2E Tests" diff --git a/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-job-template.yaml 
b/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-job-template.yaml index 393c0baecf..b6c5850502 100644 --- a/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-job-template.yaml +++ b/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-job-template.yaml @@ -59,24 +59,21 @@ stages: name: ${{ parameters.name }} clusterName: ${{ parameters.clusterName }}-$(commitID) scaleup: 50 - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - cni: cilium - dependsOn: ${{ parameters.name }} - datapath: true - dns: true - portforward: true - service: true + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + cni: cilium + datapath: true + dns: true + portforward: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} condition: failed() steps: - template: ../../templates/log-template.yaml diff --git a/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-step-template.yaml b/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-step-template.yaml index 3c04840885..23c16dd1cc 100644 --- a/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium-overlay-ebpf/cilium-overlay-e2e-step-template.yaml @@ -32,6 +32,7 @@ steps: kubectl get pods -Aowide name: "installCilium" displayName: "Install Cilium EBPF on AKS Overlay" + retryCountOnTaskFailure: 2 - script: | CNS=$(make cns-version) IPAM=$(make azure-ipam-version) diff --git a/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-job-template.yaml 
b/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-job-template.yaml index 7386ec3b25..cec64e4aa2 100644 --- a/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-job-template.yaml +++ b/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-job-template.yaml @@ -1,17 +1,36 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - dependsOn: "" - os: "linux" - testHubble: false + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: dependsOn + type: object + default: "" + - name: os + type: string + default: linux + - name: testHubble + type: boolean + default: false stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: and(succeeded(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -33,6 +52,7 @@ stages: - stage: ${{ parameters.name }} displayName: E2E-${{ parameters.displayName }} + condition: and(succeeded(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - setup - publish @@ -60,24 +80,21 @@ stages: clusterName: ${{ parameters.clusterName }}-$(commitID) testHubble: ${{ parameters.testHubble }} scaleup: 100 - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - cni: cilium - dependsOn: ${{ parameters.name }} - datapath: true - dns: true - portforward: true - service: true + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: 
$(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + cni: cilium + datapath: true + dns: true + portforward: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} condition: failed() steps: - template: ../../templates/log-template.yaml @@ -85,3 +102,4 @@ stages: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cilium + diff --git a/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-step-template.yaml b/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-step-template.yaml index d6307cf479..077c66b1d5 100644 --- a/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium-overlay-withhubble/cilium-overlay-e2e-step-template.yaml @@ -49,6 +49,7 @@ steps: make -C ./hack/aks add-cilium-log-collector name: "installCilium" displayName: "Install Cilium on AKS Overlay" + retryCountOnTaskFailure: 2 - script: | echo "Start Azilium E2E Tests on Overlay Cluster" diff --git a/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-job-template.yaml b/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-job-template.yaml index d5418ec962..27c3436e6c 100644 --- a/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-job-template.yaml +++ b/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-job-template.yaml @@ -1,16 +1,33 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - dependsOn: "" - os: "linux" + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: dependsOn + 
type: object + default: "" + - name: os + type: string + default: linux stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: and(succeeded(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -32,6 +49,7 @@ stages: - stage: ${{ parameters.name }} displayName: E2E-${{ parameters.displayName }} + condition: and(succeeded(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - setup - publish @@ -58,24 +76,21 @@ stages: name: ${{ parameters.name }} clusterName: ${{ parameters.clusterName }}-$(commitID) scaleup: 100 - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - cni: cilium - dependsOn: ${{ parameters.name }} - datapath: true - dns: true - portforward: true - service: true + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + cni: cilium + datapath: true + dns: true + portforward: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} condition: failed() steps: - template: ../../templates/log-template.yaml @@ -83,3 +98,4 @@ stages: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cilium + diff --git a/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml b/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml index 72a16839a5..1f5e5207f2 100644 --- a/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml @@ -62,6 +62,7 @@ 
steps: kubectl get po -owide -A name: "installCilium" displayName: "Install Cilium on AKS Overlay" + retryCountOnTaskFailure: 2 - script: | echo "Start Azilium E2E Tests on Overlay Cluster" diff --git a/.pipelines/singletenancy/cilium/cilium-e2e-job-template.yaml b/.pipelines/singletenancy/cilium/cilium-e2e-job-template.yaml index 64df80e661..8f0a1b4de5 100644 --- a/.pipelines/singletenancy/cilium/cilium-e2e-job-template.yaml +++ b/.pipelines/singletenancy/cilium/cilium-e2e-job-template.yaml @@ -1,16 +1,33 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - dependsOn: "" - os: "linux" + - name: name + type: string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: dependsOn + type: object + default: "" + - name: os + type: string + default: linux stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: and(succeeded(), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -32,6 +49,7 @@ stages: - stage: ${{ parameters.name }} displayName: E2E-${{ parameters.displayName }} + condition: and(succeeded(), eq(variables.TAG, variables.CURRENT_VERSION), eq(dependencies.setup.outputs['env.GitDiffCheck.RunCiliumTests'], 'true')) dependsOn: - setup - publish @@ -43,7 +61,6 @@ stages: GOPATH: "$(Agent.TempDirectory)/go" # Go workspace path GOBIN: "$(GOPATH)/bin" # Go binaries path modulePath: "$(GOPATH)/src/github.com/Azure/azure-container-networking" - condition: and(succeeded(), eq(variables.TAG, variables.CURRENT_VERSION)) pool: name: $(BUILD_POOL_NAME_DEFAULT) jobs: @@ -61,24 +78,21 @@ stages: name: ${{ parameters.name }} clusterName: ${{ parameters.clusterName 
}}-$(commitID) scaleup: 100 - - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - cni: cilium - dependsOn: ${{ parameters.name }} - datapath: true - dns: true - portforward: true - service: true + - template: ../../templates/k8s-e2e-step-template.yaml + parameters: + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + clusterName: ${{ parameters.clusterName }}-$(commitID) + os: ${{ parameters.os }} + cni: cilium + datapath: true + dns: true + portforward: true + service: true - job: failedE2ELogs displayName: "Failure Logs" dependsOn: - ${{ parameters.name }} - - cni_${{ parameters.os }} condition: failed() steps: - template: ../../templates/log-template.yaml @@ -86,3 +100,4 @@ stages: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cilium + diff --git a/.pipelines/singletenancy/cilium/cilium-e2e-step-template.yaml b/.pipelines/singletenancy/cilium/cilium-e2e-step-template.yaml index e0c327e892..89085a8c41 100644 --- a/.pipelines/singletenancy/cilium/cilium-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium/cilium-e2e-step-template.yaml @@ -51,6 +51,7 @@ steps: kubectl get po -owide -A name: "installCilium" displayName: "Install Cilium" + retryCountOnTaskFailure: 2 - script: | echo "Start Azilium E2E Tests" diff --git a/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-job-template.yaml b/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-job-template.yaml index bef5fecb76..81dfb87fec 100644 --- a/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-job-template.yaml +++ b/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-job-template.yaml @@ -1,17 +1,39 @@ parameters: - name: "" - displayName: "" - clusterType: "" - clusterName: "" - vmSize: "" - k8sVersion: "" - dependsOn: "" - os: "" - scaleup: "" + - name: name + type: 
string + default: "" + - name: displayName + type: string + default: "" + - name: clusterType + type: string + default: "" + - name: clusterName + type: string + default: "" + - name: vmSize + type: string + default: "" + - name: k8sVersion + type: string + default: "" + - name: dependsOn + type: object + default: "" + - name: os + type: string + default: "" + - name: scaleup + type: string + default: "" stages: - stage: ${{ parameters.clusterName }} displayName: ${{ parameters.displayName }} - Create Cluster + condition: or( + eq('${{ parameters.os }}', 'linux'), + and(eq('${{ parameters.os }}', 'windows'), eq(dependencies.setup.outputs['env.GitDiffCheck.RunWindowsTests'], 'true')) + ) dependsOn: - ${{ parameters.dependsOn }} - setup @@ -19,6 +41,7 @@ stages: name: $(BUILD_POOL_NAME_DEFAULT) variables: commitID: $[ stagedependencies.setup.env.outputs['EnvironmentalVariables.commitID'] ] + runWindowsTest: $[ stagedependencies.setup.env.outputs['GitDiffCheck.RunWindowsTests'] ] jobs: - template: ../../templates/create-cluster.yaml parameters: @@ -32,7 +55,6 @@ stages: dependsOn: ${{ parameters.dependsOn }} os: ${{ parameters.os }} region: $(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) # Dualstack has a specific region requirement - - stage: ${{ parameters.name }} displayName: E2E-${{ parameters.displayName }} dependsOn: @@ -50,41 +72,66 @@ stages: - job: ${{ parameters.name }}_${{ parameters.os }} displayName: DualStack Overlay Test Suite | ${{ parameters.os }} - (${{ parameters.name }}) timeoutInMinutes: 120 + # error handled by warning-handler job below + continueOnError: ${{ eq(parameters.os, 'windows') }} pool: name: $(BUILD_POOL_NAME_DEFAULT) demands: - agent.os -equals Linux - Role -equals $(CUSTOM_E2E_ROLE) steps: - - template: dualstackoverlay-e2e-step-template.yaml + - ${{ if eq(parameters.os, 'linux') }}: + - template: dualstackoverlay-e2e-step-template.yaml + parameters: + name: ${{ parameters.name }} + clusterName: ${{ parameters.clusterName }}-$(commitID) + 
os: ${{ parameters.os }} + scaleup: ${{ parameters.scaleup }} + - ${{ if eq(parameters.os, 'windows') }}: + - template: ../../templates/windows-overlay-e2e-step-template.yaml + parameters: + name: ${{ parameters.name }} + clusterName: ${{ parameters.clusterName }}-$(commitID) + cniType: cniv2 + validateCniType: dualstack + testLoadArgs: "INSTALL_DUALSTACK_OVERLAY=true VALIDATE_DUALSTACK=true" + regionClusterTest: $(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) + scaleup: ${{ parameters.scaleup }} + - template: ../../templates/k8s-e2e-step-template.yaml parameters: - name: ${{ parameters.name }} + sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} - scaleup: ${{ parameters.scaleup }} # 50 in windows or 100 in linux + dualstack: ${{ eq(parameters.os, 'linux') }} # RUN IN LINUX not WINDOWS Currently broken for scenario and blocking releases, HNS is investigating. Covered by go test in E2E step template + dns: true + portforward: true + service: ${{ eq(parameters.os, 'linux') }} # RUN IN LINUX NOT WINDOWS Currently broken for scenario and blocking releases, HNS is investigating. + hostport: true + hybridWin: ${{ eq(parameters.os, 'windows') }} - - template: ../../cni/k8s-e2e/k8s-e2e-job-template.yaml - parameters: - sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - clusterName: ${{ parameters.clusterName }}-$(commitID) - os: ${{ parameters.os }} - dependsOn: ${{ parameters.name }}_${{ parameters.os }} - dualstack: ${{ eq(parameters.os, 'linux') }} # RUN IN LINUX not WINDOWS Currently broken for scenario and blocking releases, HNS is investigating. Covered by go test in E2E step template - dns: true - portforward: true - service: ${{ eq(parameters.os, 'linux') }} # RUN IN LINUX NOT WINDOWS Currently broken for scenario and blocking releases, HNS is investigating. 
- hostport: true - hybridWin: ${{ eq(parameters.os, 'windows') }} + # can only succeed with issues in windows + - ${{ if eq(parameters.os, 'windows') }}: + - template: ../../templates/warning-handler-job-template.yaml + parameters: + dependsOnToEmitWarning: ${{ parameters.name }}_${{ parameters.os }} + clusterName: ${{ parameters.clusterName }}-$(commitID) + fileSearchPattern: "azure-vnet*.log" + # known windows issues that occur only in testing: + # - windows blue screens and writes null characters to the statefile, making it unrecoverable + # - the hns network gets deleted outside of the cni's control, so it still thinks the network exists + searchPattern: "\\x00|\\\\x00|hcnOpenNetwork failed in Win32: The network was not found" + cni: cniv2 + os: windows - job: failedE2ELogs_${{ parameters.os }} displayName: "${{ parameters.os }} Failure Logs" dependsOn: - ${{ parameters.name }}_${{ parameters.os }} - - CNI_${{ parameters.os }} - condition: in(dependencies.${{ parameters.name }}_${{ parameters.os }}.result, 'Failed') + condition: in(dependencies.${{ parameters.name }}_${{ parameters.os }}.result, 'Failed', 'SucceededWithIssues') steps: - template: ../../templates/log-template.yaml parameters: clusterName: ${{ parameters.clusterName }}-$(commitID) os: ${{ parameters.os }} cni: cniv2 + diff --git a/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-step-template.yaml b/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-step-template.yaml index 71a99f9362..54b83b7cea 100644 --- a/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-step-template.yaml +++ b/.pipelines/singletenancy/dualstack-overlay/dualstackoverlay-e2e-step-template.yaml @@ -33,84 +33,59 @@ steps: name: "kubeconfig" displayName: "Set Kubeconfig" - - ${{ if eq(parameters.os, 'linux') }}: - - script: | - kubectl cluster-info - kubectl get node - kubectl get po -owide -A - sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=linux CNI_TYPE=dualstack 
VALIDATE_STATEFILE=true INSTALL_CNS=true INSTALL_DUALSTACK_OVERLAY=true VALIDATE_DUALSTACK=true CNI_VERSION=$(make cni-version) CNS_VERSION=$(make cns-version) CLEANUP=true - retryCountOnTaskFailure: 3 - name: "integrationTest" - displayName: "Run CNS Integration Tests on AKS DualStack Overlay" + - script: | + kubectl cluster-info + kubectl get node + kubectl get po -owide -A + sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=linux CNI_TYPE=dualstack VALIDATE_STATEFILE=true INSTALL_CNS=true INSTALL_DUALSTACK_OVERLAY=true VALIDATE_DUALSTACK=true CNI_VERSION=$(make cni-version) CNS_VERSION=$(make cns-version) CLEANUP=true + retryCountOnTaskFailure: 3 + name: "integrationTest" + displayName: "Run CNS Integration Tests on AKS DualStack Overlay" - - script: | - set -e - kubectl get po -owide -A - cd test/integration/datapath - echo "Dualstack Overlay Linux datapath IPv6 test" - go test -count=1 datapath_linux_test.go -timeout 3m -tags connection -run ^TestDatapathLinux$ -tags=connection,integration -isDualStack=true - echo "Dualstack Overlay Linux datapath IPv4 test" - go test -count=1 datapath_linux_test.go -timeout 3m -tags connection -run ^TestDatapathLinux$ -tags=connection,integration - retryCountOnTaskFailure: 3 - name: "DualStack_Overlay_Linux_Tests" - displayName: "DualStack Overlay Linux Tests" - - - task: AzureCLI@2 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) - scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - clusterName=${{ parameters.clusterName }} - echo "Restarting nodes" - for val in $(az vmss list -g MC_${clusterName}_${clusterName}_$(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) --query "[].name" -o tsv); do - make -C ./hack/aks restart-vmss AZCLI=az CLUSTER=${clusterName} REGION=$(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) VMSS_NAME=${val} - done - displayName: "Restart Nodes" - - - task: AzureCLI@2 - inputs: - azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) 
- scriptLocation: "inlineScript" - scriptType: "bash" - addSpnToEnvironment: true - inlineScript: | - set -e - cd test/integration/load + - script: | + set -e + kubectl get po -owide -A + cd test/integration/datapath + echo "Dualstack Overlay Linux datapath IPv6 test" + go test -count=1 datapath_linux_test.go -timeout 3m -tags connection -run ^TestDatapathLinux$ -tags=connection,integration -isDualStack=true + echo "Dualstack Overlay Linux datapath IPv4 test" + go test -count=1 datapath_linux_test.go -timeout 3m -tags connection -run ^TestDatapathLinux$ -tags=connection,integration + retryCountOnTaskFailure: 3 + name: "DualStack_Overlay_Linux_Tests" + displayName: "DualStack Overlay Linux Tests" - # Scale Cluster Up/Down to confirm functioning CNS - ITERATIONS=2 SCALE_UP=${{ parameters.scaleup }} OS_TYPE=linux go test -count 1 -timeout 30m -tags load -run ^TestLoad$ - kubectl get pods -owide -A - - cd ../../.. - echo "Validating Node Restart" - make test-validate-state OS_TYPE=linux RESTART_CASE=true CNI_TYPE=dualstack - kubectl delete ns load-test - displayName: "Validate Node Restart" - retryCountOnTaskFailure: 3 - - - ${{ if eq(parameters.os, 'windows') }}: - - script: | - nodeList=`kubectl get node -owide | grep Windows | awk '{print $1}'` - for node in $nodeList; do - taint=`kubectl describe node $node | grep Taints | awk '{print $2}'` - if [ $taint == "node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule" ]; then - kubectl taint nodes $node node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule- - fi + - task: AzureCLI@2 + inputs: + azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + clusterName=${{ parameters.clusterName }} + echo "Restarting nodes" + for val in $(az vmss list -g MC_${clusterName}_${clusterName}_$(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) --query "[].name" -o tsv); do + make -C ./hack/aks restart-vmss AZCLI=az 
CLUSTER=${clusterName} REGION=$(REGION_DUALSTACKOVERLAY_CLUSTER_TEST) VMSS_NAME=${val} done - sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=windows CNI_TYPE=cniv2 VALIDATE_STATEFILE=true INSTALL_CNS=true INSTALL_DUALSTACK_OVERLAY=true VALIDATE_DUALSTACK=true CNI_VERSION=$(make cni-version) CNS_VERSION=$(make cns-version) CLEANUP=true - name: "WindowsDualStackOverlayControlPlaneScaleTests" - displayName: "Windows DualStack Overlay ControlPlane Scale Tests" - retryCountOnTaskFailure: 3 + displayName: "Restart Nodes" + + - task: AzureCLI@2 + inputs: + azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + cd test/integration/load - - script: | - echo "DualStack Overlay DataPath Test" - cd test/integration/datapath - sudo -E env "PATH=$PATH" go test -count=1 datapath_windows_test.go -timeout 3m -tags connection -restartKubeproxy true -run ^TestDatapathWin$ - name: "WindowsDualStackOverlayDatapathTests" - displayName: "Windows DualStack Overlay Datapath Tests" - retryCountOnTaskFailure: 3 + # Scale Cluster Up/Down to confirm functioning CNS + ITERATIONS=2 SCALE_UP=${{ parameters.scaleup }} OS_TYPE=linux go test -count 1 -timeout 30m -tags load -run ^TestLoad$ + kubectl get pods -owide -A - # Windows node restart and validation tests removed due to flakiness + cd ../../.. 
+ echo "Validating Node Restart" + make test-validate-state OS_TYPE=linux RESTART_CASE=true CNI_TYPE=dualstack + kubectl delete ns load-test + displayName: "Validate Node Restart" + retryCountOnTaskFailure: 3 diff --git a/.pipelines/templates/git-diff-check.yaml b/.pipelines/templates/git-diff-check.yaml new file mode 100644 index 0000000000..2cdf19782e --- /dev/null +++ b/.pipelines/templates/git-diff-check.yaml @@ -0,0 +1,61 @@ +parameters: + - name: targetBranch + type: string + default: "" + +steps: + - checkout: self + displayName: 'Checkout Repository' + fetchDepth: 0 + + - script: | + set -e + + SOURCE_BRANCH="$(Build.SourceBranch)" + + # Auto-detect target branch if not specified + TARGET_BRANCH="${{ parameters.targetBranch }}" + if [ -z "$TARGET_BRANCH" ]; then + if [[ "$SOURCE_BRANCH" =~ ^refs/tags/ ]]; then + echo "Tag build detected, running all tests" + echo "##vso[task.setvariable variable=RunWindowsTests;isOutput=true]true" + echo "##vso[task.setvariable variable=RunCiliumTests;isOutput=true]true" + exit 0 + elif [ -n "$PRTARGETBRANCH" ]; then + TARGET_BRANCH="$PRTARGETBRANCH" + echo "Auto-detected target branch from PR: $TARGET_BRANCH" + elif [[ "$SOURCE_BRANCH" =~ gh-readonly-queue/(.+)/pr- ]]; then + TARGET_BRANCH="${BASH_REMATCH[1]}" + echo "Auto-detected target branch from merge queue: $TARGET_BRANCH" + else + TARGET_BRANCH="master" + echo "Auto-detected target branch (default): $TARGET_BRANCH" + fi + else + echo "Using provided target branch: $TARGET_BRANCH" + fi + + echo "=== Git Diff Check Configuration ===" + echo "Current branch: $(Build.SourceBranchName)" + echo "Current commit: $(Build.SourceVersion)" + echo "Target branch: $TARGET_BRANCH" + + # Verify the target branch ref is available + if ! 
git rev-parse --verify "origin/$TARGET_BRANCH" >/dev/null 2>&1; then + echo "origin/$TARGET_BRANCH not found, falling back to master" + TARGET_BRANCH="master" + fi + + OUTPUT=$(bash hack/scripts/check-changed-files.sh "$TARGET_BRANCH") + echo "$OUTPUT" + + RUN_WINDOWS_TESTS=false + RUN_CILIUM_TESTS=false + echo "$OUTPUT" | grep -q '^RUN_WINDOWS_TESTS=true$' && RUN_WINDOWS_TESTS=true + echo "$OUTPUT" | grep -q '^RUN_CILIUM_TESTS=true$' && RUN_CILIUM_TESTS=true + + echo "##vso[task.setvariable variable=RunWindowsTests;isOutput=true]$RUN_WINDOWS_TESTS" + echo "##vso[task.setvariable variable=RunCiliumTests;isOutput=true]$RUN_CILIUM_TESTS" + name: GitDiffCheck + displayName: "Check Git Diff" + condition: always() diff --git a/.pipelines/templates/k8s-e2e-step-template.yaml b/.pipelines/templates/k8s-e2e-step-template.yaml new file mode 100644 index 0000000000..9059e3e0cc --- /dev/null +++ b/.pipelines/templates/k8s-e2e-step-template.yaml @@ -0,0 +1,142 @@ +parameters: + clusterName: "" + os: "" + sub: "" + cni: cni + datapath: false + dns: false + portforward: false + service: false + hostport: false + hybridWin: false + dualstack: false + +steps: + - task: AzureCLI@2 + inputs: + azureSubscription: ${{ parameters.sub }} + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + make -C ./hack/aks set-kubeconf AZCLI=az CLUSTER=${{ parameters.clusterName }} + + # sig-release provides test suite tarball(s) per k8s release. Just need to provide k8s version "v1.xx.xx" + # pulling k8s version from AKS. 
+ eval k8sVersion="v"$( az aks show -g ${{ parameters.clusterName }} -n ${{ parameters.clusterName }} --query "currentKubernetesVersion") + echo $k8sVersion + curl -L https://dl.k8s.io/$k8sVersion/kubernetes-test-linux-amd64.tar.gz -o ./kubernetes-test-linux-amd64.tar.gz + + # https://github.com/kubernetes/sig-release/blob/master/release-engineering/artifacts.md#content-of-kubernetes-test-system-archtargz-on-example-of-kubernetes-test-linux-amd64targz-directories-removed-from-list + # explicitly unzip and strip directories from ginkgo and e2e.test + tar -xvzf kubernetes-test-linux-amd64.tar.gz --strip-components=3 kubernetes/test/bin/ginkgo kubernetes/test/bin/e2e.test + + displayName: "Setup k8s E2E Environment" + - ${{ if contains(parameters.os, 'windows') }}: + - script: | + set -e + kubectl apply -f test/integration/manifests/load/privileged-daemonset-windows.yaml + kubectl rollout status -n kube-system ds privileged-daemonset + + kubectl get pod -n kube-system -l app=privileged-daemonset,os=windows -owide + pods=`kubectl get pod -n kube-system -l app=privileged-daemonset,os=windows --no-headers | awk '{print $1}'` + for pod in $pods; do + kubectl exec -i -n kube-system $pod -- powershell "Restart-Service kubeproxy" + kubectl exec -i -n kube-system $pod -- powershell "Get-Service kubeproxy" + done + name: kubeproxy + displayName: Restart Kubeproxy on Windows nodes + retryCountOnTaskFailure: 3 + - ${{ if eq(parameters.datapath, true) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: Datapath + name: datapath + ginkgoFocus: "(.*).Networking.should|(.*).Networking.Granular|(.*)kubernetes.api|(.*).NoSNAT" + ginkgoSkip: "SCTP|Disruptive|Slow|hostNetwork|kube-proxy|IPv6" + os: ${{ parameters.os }} + processes: 8 + attempts: 10 + - ${{ if eq(parameters.dns, true) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: DNS + name: dns + ginkgoFocus: '\[sig-network\].DNS.should' + ginkgoSkip: "resolv|256 
search" + os: ${{ parameters.os }} + processes: 8 + attempts: 3 + - ${{ if eq(parameters.portforward, true) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: Kubectl Portforward + name: portforward + ginkgoFocus: '\[sig-cli\].Kubectl.Port' + ginkgoSkip: "port-forward should keep working after detect broken connection" # affecting k8s 1.32 https://github.com/kubernetes/kubernetes/issues/129803. Note: retry this test in 1.33 with timeout extended + os: ${{ parameters.os }} + processes: 8 + attempts: 3 + - ${{ if and( eq(parameters.service, true), contains(parameters.cni, 'cni') ) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: Service Conformance + name: service + ginkgoFocus: 'Services.*\[Conformance\].*' + ginkgoSkip: "" + os: ${{ parameters.os }} + processes: 8 + attempts: 3 + - ${{ if and( eq(parameters.service, true), contains(parameters.cni, 'cilium') ) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: Service Conformance|Cilium + name: service + ginkgoFocus: 'Services.*\[Conformance\].*' + ginkgoSkip: "should serve endpoints on same port and different protocols" # Cilium does not support this feature. 
For more info on test: https://github.com/kubernetes/kubernetes/blame/e602e9e03cd744c23dde9fee09396812dd7bdd93/test/conformance/testdata/conformance.yaml#L1780-L1788 + os: ${{ parameters.os }} + processes: 8 + attempts: 3 + - ${{ if eq(parameters.hostport, true) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: Host Port + name: hostport + ginkgoFocus: '\[sig-network\](.*)HostPort|\[sig-scheduling\](.*)hostPort' + ginkgoSkip: "SCTP|exists conflict" # Skip slow 5 minute test + os: ${{ parameters.os }} + processes: 1 # Has a short serial test + attempts: 3 + - ${{ if and(eq(parameters.hybridWin, true), eq(parameters.os, 'windows')) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: Hybrid Network + name: hybrid + ginkgoFocus: '\[sig-windows\].Hybrid' + ginkgoSkip: "" + os: ${{ parameters.os }} + processes: 8 + attempts: 3 + - ${{ if and( eq(parameters.dualstack, true), eq(contains(parameters.cni, 'cilium'), false) ) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: DualStack Test + name: DualStack + clusterName: ${{ parameters.clusterName }} + ginkgoFocus: '\[Feature:IPv6DualStack\]' + ginkgoSkip: "SCTP|session affinity|should function for service endpoints using hostNetwork" # Skip hostNetwork service endpoints while failures are investigated + os: ${{ parameters.os }} + processes: 8 + attempts: 3 + - ${{ if and( eq(parameters.dualstack, true), contains(parameters.cni, 'cilium') ) }}: + - template: ../cni/k8s-e2e/k8s-e2e-step-template.yaml + parameters: + testName: DualStack Test|Cilium + name: DualStack + clusterName: ${{ parameters.clusterName }} + ginkgoFocus: '\[Feature:IPv6DualStack\]' + ginkgoSkip: "SCTP|session affinity|should function for service endpoints using hostNetwork" # Cilium dualstack has a known issue with this test https://github.com/cilium/cilium/issues/25135 + os: ${{ parameters.os }} + processes: 8 + attempts: 3 diff --git 
a/.pipelines/templates/log-template.yaml b/.pipelines/templates/log-template.yaml index 558a59f4dc..f79c308197 100644 --- a/.pipelines/templates/log-template.yaml +++ b/.pipelines/templates/log-template.yaml @@ -49,6 +49,17 @@ steps: condition: always() continueOnError: true # Tends to fail after node restart due to pods still restarting. This should not block other tests or logs from running. + - bash: | + kubectl get pods -A -owide + displayName: All Pods + condition: always() + + - bash: | + bash hack/scripts/collect-bad-pod-info.sh + displayName: Collect Bad Pod Info + condition: always() + continueOnError: true + - bash: | kubectl describe nodes displayName: Node Status diff --git a/.pipelines/templates/warning-handler-job-template.yaml b/.pipelines/templates/warning-handler-job-template.yaml index e8129f6a76..16f42b548c 100644 --- a/.pipelines/templates/warning-handler-job-template.yaml +++ b/.pipelines/templates/warning-handler-job-template.yaml @@ -32,6 +32,8 @@ jobs: acnLogs=$(System.DefaultWorkingDirectory)/${{ parameters.clusterName }}_check_common_issues acnLogs=$acnLogs cni=${{ parameters.cni }} bash collect-${{ parameters.os }}-logs.sh echo "acnLogs directory: $acnLogs" + echo "fileSearchPattern: ${{ parameters.fileSearchPattern }}" + echo "searchPattern: ${{ parameters.searchPattern }}" pwd bash check-cni-log-contents.sh "$acnLogs" "${{ parameters.fileSearchPattern }}" "${{ parameters.searchPattern }}" displayName: Check for known ${{ parameters.os }} issue diff --git a/.pipelines/templates/windows-overlay-e2e-step-template.yaml b/.pipelines/templates/windows-overlay-e2e-step-template.yaml new file mode 100644 index 0000000000..8054c653ea --- /dev/null +++ b/.pipelines/templates/windows-overlay-e2e-step-template.yaml @@ -0,0 +1,122 @@ +parameters: + name: "" + clusterName: "" + cniType: "cniv2" # CNI_TYPE for make test-load (cniv2 or stateless) + validateCniType: "cniv2" # CNI_TYPE for make test-validate-state (cniv2, stateless, or dualstack) + 
testLoadArgs: "" # Extra args for make test-load (e.g. "INSTALL_AZURE_CNI_OVERLAY=true VALIDATE_V4OVERLAY=true") + regionClusterTest: "$(REGION_AKS_CLUSTER_TEST)" + scaleup: "32" + runWireserverTests: "false" + +steps: + - bash: | + go version + go env + mkdir -p '$(GOBIN)' + mkdir -p '$(GOPATH)/pkg' + mkdir -p '$(modulePath)' + echo '##vso[task.prependpath]$(GOBIN)' + echo '##vso[task.prependpath]$(GOROOT)/bin' + name: "GoEnv" + displayName: "Set up the Go environment" + + - task: KubectlInstaller@0 + inputs: + kubectlVersion: latest + + - task: AzureCLI@2 + inputs: + azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + make -C ./hack/aks set-kubeconf AZCLI=az CLUSTER=${{ parameters.clusterName }} + name: "kubeconfig" + displayName: "Set Kubeconfig" + + - task: AzureCLI@2 + inputs: + azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + clusterName=${{ parameters.clusterName }} + + # set cni name to azure so windows cleanup script runs (though testing can still pass without this usually) + cd hack/scripts + bash patch-kubeclusterconfig.sh || true + cd ../.. 
+ + # Restart nodes + echo "Restarting nodes" + kubectl get po -owide -A + for val in $(az vmss list -g MC_${clusterName}_${clusterName}_${{ parameters.regionClusterTest }} --query "[].name" -o tsv); do + make -C ./hack/aks restart-vmss AZCLI=az CLUSTER=${clusterName} REGION=${{ parameters.regionClusterTest }} VMSS_NAME=${val} + done + + # Remove taints from Windows nodes + nodeList=`kubectl get node -owide | grep Windows | awk '{print $1}'` + for node in $nodeList; do + taint=`kubectl describe node $node | grep Taints | awk '{print $2}'` + if [ $taint == "node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule" ]; then + kubectl taint nodes $node node.cloudprovider.kubernetes.io/uninitialized=true:NoSchedule- + fi + done + + # Run control plane scale tests + sudo -E env "PATH=$PATH" make test-load SCALE_UP=32 OS_TYPE=windows CNI_TYPE=${{ parameters.cniType }} VALIDATE_STATEFILE=true INSTALL_CNS=true ${{ parameters.testLoadArgs }} CNS_VERSION=$(make cns-version) CNI_VERSION=$(make cni-version) CLEANUP=true + + # Run datapath tests + echo "Windows Overlay DataPath Test" + cd test/integration/datapath + sudo -E env "PATH=$PATH" go test -count=1 datapath_windows_test.go -timeout 12m -tags connection -restartKubeproxy true -run ^TestDatapathWin$ + name: "WindowsOverlayTests" + displayName: "Windows Overlay ControlPlane and DataPath Tests" + retryCountOnTaskFailure: 2 + + - task: AzureCLI@2 + inputs: + azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) + scriptLocation: "inlineScript" + scriptType: "bash" + addSpnToEnvironment: true + inlineScript: | + set -e + clusterName=${{ parameters.clusterName }} + + # set cni name to azure so windows cleanup script runs (though testing can still pass without this usually) + cd hack/scripts + bash patch-kubeclusterconfig.sh || true + cd ../.. 
+ + # Restart nodes + echo "Restarting nodes" + kubectl get po -owide -A + for val in $(az vmss list -g MC_${clusterName}_${clusterName}_${{ parameters.regionClusterTest }} --query "[].name" -o tsv); do + make -C ./hack/aks restart-vmss AZCLI=az CLUSTER=${clusterName} REGION=${{ parameters.regionClusterTest }} VMSS_NAME=${val} + done + + # Scale Cluster Up/Down to confirm functioning CNS + cd test/integration/load + ITERATIONS=2 SCALE_UP=${{ parameters.scaleup }} OS_TYPE=windows go test -count 1 -timeout 30m -tags load -run ^TestLoad$ + kubectl get pods -owide -A + + cd ../../.. + echo "Validating Node Restart" + make test-validate-state OS_TYPE=windows RESTART_CASE=true CNI_TYPE=${{ parameters.validateCniType }} + kubectl delete ns load-test + name: "WindowsOverlayValidateRestart" + displayName: "Validate Windows Overlay Node Restart" + retryCountOnTaskFailure: 2 + + - ${{ if eq(parameters.runWireserverTests, 'true') }}: + - script: | + echo "Run wireserver and metadata connectivity Tests" + bash test/network/wireserver_metadata_test.sh + retryCountOnTaskFailure: 3 + name: "WireserverMetadataConnectivityTests" + displayName: "Run Wireserver and Metadata Connectivity Tests" diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000..4c62c9f3fa --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +Use all agents.md found from the root to the current directory as instructions and context. diff --git a/agents.md b/agents.md new file mode 100644 index 0000000000..1670beb654 --- /dev/null +++ b/agents.md @@ -0,0 +1,73 @@ +# agents.md + +Behavioral guidelines to reduce common LLM coding mistakes. Merge with project-specific instructions as needed. + +**Tradeoff:** These guidelines bias toward caution over speed. For trivial tasks, use judgment. + +## 0. Use Repo Skills + +Before implementing, review relevant repo skills in `.github/skills/` and apply them when the task touches Go, APIs, CRDs, controllers, or platform code. 
+ +- Skill use is required when applicable, not optional decoration. +- Multiple skills may apply; combine them when the change crosses concerns. +- For Go work, start with the relevant `acn-go-*` skills and follow them. + +## 1. Think Before Coding + +**Don't assume. Don't hide confusion. Surface tradeoffs.** + +Before implementing: +- State your assumptions explicitly. If uncertain, ask. +- If multiple interpretations exist, present them - don't pick silently. +- If a simpler approach exists, say so. Push back when warranted. +- If something is unclear, stop. Name what's confusing. Ask. + +## 2. Simplicity First + +**Minimum code that solves the problem. Nothing speculative.** + +- No features beyond what was asked. +- No abstractions for single-use code. +- No "flexibility" or "configurability" that wasn't requested. +- No error handling for impossible scenarios. +- If you write 200 lines and it could be 50, rewrite it. + +Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. + +## 3. Surgical Changes + +**Touch only what you must. Clean up only your own mess.** + +When editing existing code: +- Don't "improve" adjacent code, comments, or formatting. +- Don't refactor things that aren't broken. +- Match existing style, even if you'd do it differently. +- If you notice unrelated dead code, mention it - don't delete it. + +When your changes create orphans: +- Remove imports/variables/functions that YOUR changes made unused. +- Don't remove pre-existing dead code unless asked. + +The test: Every changed line should trace directly to the user's request. + +## 4. Goal-Driven Execution + +**Define success criteria. Loop until verified.** + +Transform tasks into verifiable goals: +- "Add validation" → "Write tests for invalid inputs, then make them pass" +- "Fix the bug" → "Write a test that reproduces it, then make it pass" +- "Refactor X" → "Ensure tests pass before and after" + +For multi-step tasks, state a brief plan: +``` +1. 
[Step] → verify: [check] +2. [Step] → verify: [check] +3. [Step] → verify: [check] +``` + +Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification. + +--- + +**These guidelines are working if:** fewer unnecessary changes in diffs, fewer rewrites due to overcomplication, and clarifying questions come before implementation rather than after mistakes. diff --git a/cns/agents.md b/cns/agents.md new file mode 100644 index 0000000000..067cd52574 --- /dev/null +++ b/cns/agents.md @@ -0,0 +1,66 @@ +# CNS Agent Brief (AKS/Azure CNI) + +This brief is for code contributors working on Container Networking Service (CNS) in this repo. + +## What CNS is +CNS is the container networking daemon running on every AKS node for Azure CNI. It bridges the AKS control plane and node dataplane by managing pod IP allocation state and coordinating with the Azure networking control plane for routable VNET IPs. + +In Azure CNI modes where pods receive routable VNET IPs, CNS tracks the goal state from the NodeNetworkConfig (NNC) CRD and only exposes IPs to pods once the Network Container (NC) is programmed by NMAgent. + +## Core architecture (high level) +- HTTP REST API used by CNI/azure-ipam for IP assignment and release. +- Controller-runtime reconcilers that watch NNCs (and SwiftV2 CRDs) and reconcile desired NC/IP state. +- IPAM pool monitor (dynamic IP scenarios) to resize IP pools on demand. Not used in fixed-prefix modes (e.g., overlay). +- IMDS/wireserver integration to query NMAgent NC status. + +## Core flow (node → pod IP assignment) +1) DNC/RC creates a NodeNetworkConfig (NNC) for the node. +2) DNC allocates IPs as Network Containers (NCs) and publishes them to the fabric. +3) NMAgent programs the NC; CNS watches for NC version readiness via IMDS/wireserver. +4) CNS ingests NNC as goal state; once NC is ready, IPs become Available. +5) CNI/azure-ipam calls CNS to RequestIPConfigs/ReleaseIPConfigs during pod lifecycle. 
+6) CNS acts as the IPAM store for pod IP allocation and release. + +## Primary CNS APIs used by AKS +Primary (current) IPAM APIs: +- POST /network/requestipconfigs — allocate one or more IPs for a pod. +- POST /network/releaseipconfigs — release IPs for a pod. + +Legacy fallbacks (used when RequestIPConfigs/ReleaseIPConfigs are unsupported): +- POST /network/requestipconfig — allocate a single IP. +- POST /network/releaseipconfig — release a single IP. + +## Configuration inputs +- CNS is configured via JSON config file (cns_config.json) provided via ConfigMap in AKS. +- Config file path can be set by CNS_CONFIGURATION_PATH env var or command-line flag. +- Environment variables used by CNS (examples): NODENAME, NODE_IP, POD_CIDRs, SERVICE_CIDRs, INFRA_VNET_CIDRs. +- There are legacy flags and env vars in main; treat configuration as scenario-specific and static. + +## State persistence +CNS writes persistent state to disk: +- Main CNS state: /var/lib/azure-network/azure-cns.json (Linux default). Stores NCs, networks, orchestrator data, timestamps, etc. +- Endpoint state (ManageEndpointState=true): /var/run/azure-cns/azure-endpoints.json on Linux, /k/azurecns/azure-endpoints.json on Windows. Stores containerID → endpoint IP mappings. Critical state for IPAM - any issues can leak IPs. +- CNI state file used for migration: /var/run/azure-vnet.json (Linux default). + +## Code entry points (start here) +- Service entry: cns/service/main.go +- REST API handlers: cns/restserver/ +- Reconcilers: cns/kubecontroller/ +- IPAM pool monitor: cns/ipampool/ + +## Required repo skills +Before changing CNS code, review and apply the relevant repo skills in `.github/skills/`; this is required for CNS work, not optional. Choose by surface area and combine them when a change crosses concerns: +- REST/IPAM handlers in `cns/restserver/`: `acn-go-http-api-contracts`; also apply `acn-go-types-parsing` for IP/prefix payloads and other network-typed inputs. 
+- Reconcilers, NNC/SwiftV2 flows, response codes, or persisted/observed state: `acn-go-control-plane-contracts`, with `acn-go-design-boundaries` and `acn-go-interfaces-dependencies` when shaping behavior or dependency seams. +- `main`, startup, pool monitors, background goroutines, long-lived loops, or shutdown: `acn-go-context-lifecycle`. +- Linux/Windows divergence, build tags, or OS-specific paths/state: `acn-go-platform-abstraction`. +- Any change to error strings, zap logs, or operational observability: `acn-go-errors-logging`. + +## Tests to run for CNS changes +- Tests in the packages above (restserver, kubecontroller, ipampool, service) are the first-line regression checks. + +## Critical caution areas +CNS initialization and state management are high-risk code paths. Behavioral changes can cause large-scale impact. Keep changes minimal and validate thoroughly. + +## SwiftV2 note +SwiftV2 adds multitenancy and multi-NIC behaviors. Treat SwiftV2-only APIs/CRDs as specialized paths; verify scenario-specific behavior when touching related code. 
diff --git a/cns/restserver/api_test.go b/cns/restserver/api_test.go index 4417d4fff4..f68fb7facb 100644 --- a/cns/restserver/api_test.go +++ b/cns/restserver/api_test.go @@ -2095,10 +2095,17 @@ func TestPostNetworkContainersWithIPv6(t *testing.T) { ncResponses, err := getAllNetworkContainers(t, ncParamsWithIPv6) require.NoError(t, err) - // Verify each NC has IPv6Configuration - for i, nc := range ncResponses.NetworkContainers { - assert.Equal(t, ncParamsWithIPv6[i].ipv6Config.IPSubnet.IPAddress, nc.IPv6Configuration.IPSubnet.IPAddress, "NC %d: IPv6 address mismatch", i) - assert.Equal(t, ncParamsWithIPv6[i].ipv6Config.GatewayIPAddress, nc.IPv6Configuration.GatewayIPAddress, "NC %d: IPv6 gateway mismatch", i) + // Verify each NC has IPv6Configuration (index by NC ID to avoid ordering assumptions) + ncByID := make(map[string]cns.GetNetworkContainerResponse, len(ncResponses.NetworkContainers)) + for _, nc := range ncResponses.NetworkContainers { + ncByID[nc.NetworkContainerID] = nc + } + for i, params := range ncParamsWithIPv6 { + ncID := "Swift_" + params.ncID + nc, ok := ncByID[ncID] + require.True(t, ok, "NC %d (%s) not found in response", i, ncID) + assert.Equal(t, params.ipv6Config.IPSubnet.IPAddress, nc.IPv6Configuration.IPSubnet.IPAddress, "NC %d: IPv6 address mismatch", i) + assert.Equal(t, params.ipv6Config.GatewayIPAddress, nc.IPv6Configuration.GatewayIPAddress, "NC %d: IPv6 gateway mismatch", i) } // Cleanup diff --git a/hack/scripts/check-changed-files.sh b/hack/scripts/check-changed-files.sh new file mode 100755 index 0000000000..277eecbc16 --- /dev/null +++ b/hack/scripts/check-changed-files.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -e + +# Usage: check-changed-files.sh +# Outputs two lines: +# RUN_WINDOWS_TESTS=true|false +# RUN_CILIUM_TESTS=true|false + +TARGET_BRANCH="${1:?target branch is required}" + +# Get the merge base to compare against the common ancestor +MERGE_BASE=$(git merge-base HEAD "origin/$TARGET_BRANCH") +echo "Merge base commit: 
$MERGE_BASE" + +echo "=== Files Changed Compared to $TARGET_BRANCH ===" +CHANGED_FILES=$(git diff --name-only "$MERGE_BASE...HEAD") + +if [ -z "$CHANGED_FILES" ]; then + echo "No files changed, running all tests" + echo "RUN_WINDOWS_TESTS=true" + echo "RUN_CILIUM_TESTS=true" + exit 0 +fi + +echo "$CHANGED_FILES" + +# all_files_match_skip_patterns +# Returns 0 (true) if every changed file matches at least one of the +# supplied patterns (regex for =~ match). Returns 1 otherwise. +all_files_match_skip_patterns() { + local changed_files="$1" + shift + local patterns=("$@") + + for file in $changed_files; do + local match_found=false + for pattern in "${patterns[@]}"; do + if [[ "$file" =~ $pattern ]]; then + match_found=true + break + fi + done + if [[ "$match_found" == false ]]; then + return 1 + fi + done + return 0 +} + +# --- Windows tests --- +WINDOWS_SKIP_PATTERNS=("_linux\.go$" "_linux_test\.go$") + +if all_files_match_skip_patterns "$CHANGED_FILES" "${WINDOWS_SKIP_PATTERNS[@]}"; then + RUN_WINDOWS_TESTS=false +else + RUN_WINDOWS_TESTS=true +fi +echo "Run Windows Tests: $RUN_WINDOWS_TESTS" + +# --- Cilium tests --- +CILIUM_SKIP_PATTERNS=("^cni/" "^ipam/" "^network/") + +if all_files_match_skip_patterns "$CHANGED_FILES" "${CILIUM_SKIP_PATTERNS[@]}"; then + RUN_CILIUM_TESTS=false +else + RUN_CILIUM_TESTS=true +fi +echo "Run Cilium Tests: $RUN_CILIUM_TESTS" + + +echo "RUN_WINDOWS_TESTS=$RUN_WINDOWS_TESTS" +echo "RUN_CILIUM_TESTS=$RUN_CILIUM_TESTS" diff --git a/hack/scripts/check-cni-log-contents.sh b/hack/scripts/check-cni-log-contents.sh index 331e5b7ce0..632a7e834a 100644 --- a/hack/scripts/check-cni-log-contents.sh +++ b/hack/scripts/check-cni-log-contents.sh @@ -12,10 +12,13 @@ file_pattern="$2" content_pattern="$3" # Find files matching pattern and grep for content-- use -aP so we can detect null (\x00) characters -find "$directory" -name "$file_pattern" -type f -exec grep -aP -l "$content_pattern" {} \; +matches=$(find "$directory" -name "$file_pattern" 
-type f -exec grep -aP -l "$content_pattern" {} +) -# Use exec command terminator "+" to run grep once (unless there are many files) -if find "$directory" -name "$file_pattern" -type f -exec grep -aP -q "$content_pattern" {} +; then +if [ -n "$matches" ]; then + echo "$matches" + echo "" + echo "Matching lines:" + find "$directory" -name "$file_pattern" -type f -exec grep -aP -n "$content_pattern" {} + exit 0 else exit 1 diff --git a/hack/scripts/collect-bad-pod-info.sh b/hack/scripts/collect-bad-pod-info.sh new file mode 100644 index 0000000000..5e09b99539 --- /dev/null +++ b/hack/scripts/collect-bad-pod-info.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +LOG_LINES=600 + +kubectl get pods -A -o wide --no-headers | while read -r line; do + namespace=$(echo "$line" | awk '{print $1}') + pod=$(echo "$line" | awk '{print $2}') + status=$(echo "$line" | awk '{print $4}') + + if [[ "$status" != "Running" ]]; then + echo "==============================" + echo "Namespace: $namespace" + echo "Pod: $pod" + echo "Status: $status" + echo "------------------------------" + echo "Events:" + kubectl describe pod "$pod" -n "$namespace" | awk '/^Events:/,/^$/' + echo "------------------------------" + echo "Init Container Logs:" + for ic in $(kubectl get pod "$pod" -n "$namespace" -o jsonpath='{.spec.initContainers[*].name}'); do + echo "--- Init Container: $ic (head) ---" + kubectl logs "$pod" -n "$namespace" -c "$ic" | head -"$LOG_LINES" + echo "--- Init Container: $ic (tail) ---" + kubectl logs "$pod" -n "$namespace" -c "$ic" --tail="$LOG_LINES" + done + echo "------------------------------" + echo "Sidecar Logs:" + for sc in $(kubectl get pod "$pod" -n "$namespace" -o jsonpath='{.spec.containers[*].name}'); do + echo "--- Sidecar: $sc (head) ---" + kubectl logs "$pod" -n "$namespace" -c "$sc" | head -"$LOG_LINES" + echo "--- Sidecar: $sc (tail) ---" + kubectl logs "$pod" -n "$namespace" -c "$sc" --tail="$LOG_LINES" + done + echo "==============================" + echo + fi +done diff 
--git a/hack/scripts/collect-linux-logs.sh b/hack/scripts/collect-linux-logs.sh index 747f8f068d..db07ac994c 100644 --- a/hack/scripts/collect-linux-logs.sh +++ b/hack/scripts/collect-linux-logs.sh @@ -1,5 +1,25 @@ #!/bin/bash # Usage: acnLogs="./linux-logs/" cni="cniv2" bash collect-linux-logs.sh + +# Collect all container logs (init and sidecar) for a given pod into a directory. +# Usage: collect_all_container_logs +collect_all_container_logs() { + local pod=$1 + local outdir=$2 + # Collect logs from all init containers + initContainers=$(kubectl get pod -n kube-system "$pod" -o jsonpath='{.spec.initContainers[*].name}') + for container in $initContainers; do + kubectl logs -n kube-system "$pod" -c "$container" > "${outdir}/${pod}_${container}.log" 2>&1 + echo " Init container log captured: ${outdir}/${pod}_${container}.log" + done + # Collect logs from all (sidecar) containers + containers=$(kubectl get pod -n kube-system "$pod" -o jsonpath='{.spec.containers[*].name}') + for container in $containers; do + kubectl logs -n kube-system "$pod" -c "$container" > "${outdir}/${pod}_${container}.log" 2>&1 + echo " Container log captured: ${outdir}/${pod}_${container}.log" + done +} + echo "Ensure that privileged pod exists on each node" kubectl apply -f ../../test/integration/manifests/load/privileged-daemonset.yaml kubectl rollout status ds -n kube-system privileged-daemonset @@ -90,4 +110,19 @@ if [ ${cni} = 'cilium' ]; then kubectl exec -i -n kube-system $pod -- cilium endpoint list -o json > ${acnLogs}/"$node"_logs/Cilium-output/$file echo "Cilium, $file, captured: ${acnLogs}/"$node"_logs/Cilium-output/$file" done + + echo "------ Cilium kubectl logs ------" + mkdir -p ${acnLogs}/cilium-kubectl-logs/ + echo "Directory created: ${acnLogs}/cilium-kubectl-logs/" + + echo "Capture all container logs from Cilium daemonset pods" + for pod in $ciliumPods; do + collect_all_container_logs "$pod" "${acnLogs}/cilium-kubectl-logs" + done + + echo "Capture all container logs 
from Cilium operator deployment pods" + ciliumOperatorPods=`kubectl get pods -n kube-system -l name=cilium-operator --no-headers | awk '{print $1}'` + for pod in $ciliumOperatorPods; do + collect_all_container_logs "$pod" "${acnLogs}/cilium-kubectl-logs" + done fi diff --git a/hack/scripts/collect-windows-logs.sh b/hack/scripts/collect-windows-logs.sh index 1225bfb3e7..71fe4e3741 100644 --- a/hack/scripts/collect-windows-logs.sh +++ b/hack/scripts/collect-windows-logs.sh @@ -36,6 +36,11 @@ for pod in $podList; do file="azure-vnet.json" kubectl exec -i -n kube-system $pod -- powershell cat ../../k/$file > ${acnLogs}/"$node"_logs/privileged-output/$file echo "CNI State, $file, captured: ${acnLogs}/"$node"_logs/privileged-output/$file" + + file="windowsnodereset.log" + kubectl exec -i -n kube-system $pod -- powershell cat ../../k/$file > ${acnLogs}/"$node"_logs/privileged-output/$file + echo "Node Reset Log, $file, captured: ${acnLogs}/"$node"_logs/privileged-output/$file" + if [ ${cni} = 'cniv1' ]; then file="azure-vnet-ipam.json" kubectl exec -i -n kube-system $pod -- powershell cat ../../k/$file > ${acnLogs}/"$node"_logs/privileged-output/$file diff --git a/hack/scripts/k8se2e-tests.sh b/hack/scripts/k8se2e-tests.sh index 5fa60b09cc..6fabec1f9a 100644 --- a/hack/scripts/k8se2e-tests.sh +++ b/hack/scripts/k8se2e-tests.sh @@ -1,3 +1,6 @@ +OS=$1 +TYPE=$2 + # Taint Linux nodes so that windows tests do not run on them and ensure no LinuxOnly tests run on windows nodes if [[ 'windows' == $OS ]] then @@ -25,7 +28,7 @@ echo "./ginkgo --nodes=4 \ --num-nodes=2 \ --provider=skeleton \ --ginkgo.focus='(.*).Networking.should|(.*).Networking.Granular|(.*)kubernetes.api' \ ---ginkgo.skip='SCTP|Disruptive|Slow|hostNetwork|kube-proxy|IPv6' \ +--ginkgo.skip="SCTP|Disruptive|Slow|hostNetwork|kube-proxy|IPv6$SKIP" \ --ginkgo.flakeAttempts=3 \ --ginkgo.v \ --node-os-distro=$OS \ diff --git a/hack/scripts/patch-kubeclusterconfig.sh b/hack/scripts/patch-kubeclusterconfig.sh new file 
mode 100755 index 0000000000..0b20aca2dc --- /dev/null +++ b/hack/scripts/patch-kubeclusterconfig.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# Attempts to patch kubeclusterconfig.json and windowsnodereset.ps1 on Windows nodes +# - Sets Cni.Name to "azure" in kubeclusterconfig.json +# - Replaces '>> $global:LogPath' with '*>> $global:LogPath' in windowsnodereset.ps1 +# Usage: bash patch-kubeclusterconfig.sh + +echo "Patching kubeclusterconfig.json and windowsnodereset.ps1 on Windows nodes" +kubectl apply -f ../../test/integration/manifests/load/privileged-daemonset-windows.yaml +kubectl rollout status ds -n kube-system privileged-daemonset --timeout=5m + +podList=$(kubectl get pods -n kube-system -l os=windows,app=privileged-daemonset --no-headers -o custom-columns=NAME:.metadata.name) +allSucceeded=true +for pod in $podList; do + succeeded=false + for attempt in 1 2 3; do + echo "Attempt $attempt: Patching kubeclusterconfig.json on $pod" + if kubectl exec -n kube-system "$pod" -- powershell.exe -command \ + 'Get-Content "c:\k\kubeclusterconfig.json" -Raw | ConvertFrom-Json | % { $_.Cni.Name = "azure"; $_ } | ConvertTo-Json -Depth 20 | Set-Content "c:\k\kubeclusterconfig.json"'; then + echo "Successfully patched kubeclusterconfig.json on $pod" + succeeded=true + break + else + echo "Failed to patch kubeclusterconfig.json on $pod (attempt $attempt)" + sleep 20 + fi + done + + if [ "$succeeded" = true ]; then + succeeded=false + for attempt in 1 2 3; do + echo "Attempt $attempt: Patching windowsnodereset.ps1 on $pod" + # Replace ">> $global:LogPath" (or "*>> $global:LogPath", "**>>", etc.) with "*>> $global:LogPath" + # so that all output streams are captured to the log file. The regex is idempotent. 
+ if kubectl exec -n kube-system "$pod" -- powershell.exe -command \ + "(Get-Content 'c:\k\windowsnodereset.ps1') -replace '[*]*>> \\\$global:LogPath', '*>> \$global:LogPath' | Set-Content 'c:\k\windowsnodereset.ps1'"; then + echo "Successfully patched windowsnodereset.ps1 on $pod" + succeeded=true + break + else + echo "Failed to patch windowsnodereset.ps1 on $pod (attempt $attempt)" + sleep 20 + fi + done + fi + + if [ "$succeeded" = false ]; then + echo "WARNING: Failed to patch on $pod after 3 attempts" + allSucceeded=false + fi +done + +if [ "$allSucceeded" = true ]; then + echo "All nodes patched successfully" +else + echo "WARNING: Some nodes failed to patch, continuing anyway" +fi + +echo "Cleaning up privileged daemonset" +kubectl delete ds -n kube-system privileged-daemonset --ignore-not-found diff --git a/test/validate/windows_validate.go b/test/validate/windows_validate.go index 3628c818ac..634ac8e2e6 100644 --- a/test/validate/windows_validate.go +++ b/test/validate/windows_validate.go @@ -205,14 +205,19 @@ func hnsStateFileIPs(result []byte) (map[string]string, error) { } // return windows HNS network state +// PowerShell's ConvertTo-Json returns a single object when there is only one network, +// so we try unmarshalling as an array first, then fall back to a single object. 
func hnsNetworkState(result []byte) ([]HNSNetwork, error) { var hnsNetworkResult []HNSNetwork - err := json.Unmarshal(result, &hnsNetworkResult) - if err != nil { - return nil, errors.Wrap(err, "failed to unmarshal HNS network state file") + if err := json.Unmarshal(result, &hnsNetworkResult); err == nil { + return hnsNetworkResult, nil } - - return hnsNetworkResult, nil + // if unmarshal into array above fails, try unmarshal into single object + var singleResult HNSNetwork + if err := json.Unmarshal(result, &singleResult); err != nil { + return nil, errors.Wrap(err, "failed to unmarshal HNS network state as array or object") + } + return []HNSNetwork{singleResult}, nil } func azureVnetIps(result []byte) (map[string]string, error) { @@ -273,7 +278,7 @@ func validateHNSNetworkState(ctx context.Context, nodes *corev1.NodeList, client if err != nil { return errors.Wrap(err, "failed to exec into privileged pod") } - + log.Printf("results from node %s: %s", nodes.Items[index].Name, result) hnsNetwork, err := hnsNetworkState(result) log.Printf("hnsNetwork: %+v", hnsNetwork) if err != nil { @@ -281,11 +286,15 @@ func validateHNSNetworkState(ctx context.Context, nodes *corev1.NodeList, client } // check hns properties - if len(hnsNetwork) == 1 { - return errors.New("HNS default ext network or azure network does not exist") + if len(hnsNetwork) < 1 { + return errors.New("no HNS networks found") } + foundAzure := false for _, network := range hnsNetwork { + if network.Name == "azure" { + foundAzure = true + } if network.State != 1 { return errors.New("windows HNS network state is not correct") } @@ -300,6 +309,10 @@ func validateHNSNetworkState(ctx context.Context, nodes *corev1.NodeList, client } } } + + if !foundAzure { + return errors.New("HNS 'azure' network does not exist") + } } return nil }