diff --git a/cli/azd/docs/concurrency-model.md b/cli/azd/docs/concurrency-model.md index 6fe33ce7ebb..af6e57dfe6f 100644 --- a/cli/azd/docs/concurrency-model.md +++ b/cli/azd/docs/concurrency-model.md @@ -169,6 +169,64 @@ both attempt initialization; the lock ensures only one succeeds. --- +## Lock Acquisition Order + +Consistent lock ordering prevents deadlocks. The environment persistence path +uses a three-level hierarchy: + +```text +1. manager.saveMu (in-process sync.Mutex — serializes goroutines) +2. local flock (cross-process OS file lock — serializes processes) +3. env.mu (per-Environment sync.RWMutex — protects in-memory map) +``` + +Every code path that persists or reloads environment state acquires these locks +**in the order above**. Never acquire an outer lock while holding an inner one. + +### Why subprocess hooks cannot deadlock + +Parallel service hooks (pre/post-deploy, pre/post-package) may spawn `azd env set` +subprocesses concurrently. Each subprocess is a separate OS process with its **own** +`manager.saveMu` instance — in-process mutexes are not shared across processes. + +Cross-process serialization is handled entirely by **flock** (the OS-level file +lock on the `.env.lock` file). Within each subprocess the same acquisition order +applies (saveMu → flock → env.mu), but since saveMu is per-process and never +shared across process boundaries, circular wait is impossible. + +```text +┌─────────────────────────────────────────────┐ +│ Parent azd process │ +│ │ +│ goroutine A: saveMu → flock → env.mu │ +│ goroutine B: saveMu → flock → env.mu │ +│ (saveMu serializes A and B) │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Hook subprocess 1 (`azd env set FOO=bar`) │ +│ main goroutine: saveMu → flock → env.mu │ +└─────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────┐ +│ Hook subprocess 2 (`azd env set BAZ=qux`) │ +│ main goroutine: saveMu → flock → env.mu │ +└─────────────────────────────────────────────┘ + +Across processes, only flock provides mutual exclusion. +Within a process, saveMu prevents goroutine interleaving. +``` + +### What is held during subprocess invocations + +When the parent process launches hook subprocesses, it does **not** hold saveMu +or flock during the subprocess lifetime. The hook runner starts the subprocess +and waits for it to exit — locks are only acquired when the subprocess (or the +parent) calls `Save`/`Reload`. This means hook subprocesses never contend with +locks held by the parent's hook-launch path. + +--- + ## Adding new concurrent state When you introduce a new field on one of the types above (or a new type that diff --git a/cli/azd/internal/cmd/deploy_progress.go b/cli/azd/internal/cmd/deploy_progress.go index 14c87be6cf9..711ad497a0d 100644 --- a/cli/azd/internal/cmd/deploy_progress.go +++ b/cli/azd/internal/cmd/deploy_progress.go @@ -7,9 +7,11 @@ import ( "context" "fmt" "io" + "regexp" "strings" "sync" "time" + "unicode" ) // Column widths for the progress table. @@ -69,6 +71,21 @@ type deployProgressTracker struct { lastLines int // how many lines we rendered last time (for ANSI overwrite) } +// ansiEscapeRe matches ANSI escape sequences (CSI and OSC). +var ansiEscapeRe = regexp.MustCompile(`\x1b\[[0-9;]*[A-Za-z~]|\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)`) + +// sanitizeServiceName strips ANSI escape sequences and non-printable characters +// from a service name to prevent terminal output manipulation. +func sanitizeServiceName(name string) string { + name = ansiEscapeRe.ReplaceAllString(name, "") + return strings.Map(func(r rune) rune { + if r == '\t' || (unicode.IsPrint(r) && !unicode.IsControl(r)) { + return r + } + return -1 + }, name) +} + func newDeployProgressTracker( writer io.Writer, interactive bool, serviceNames []string, ) *deployProgressTracker { @@ -76,7 +93,7 @@ func newDeployProgressTracker( idx := make(map[string]int, len(serviceNames)) for i, name := range serviceNames { services[i] = &serviceStatus{ - name: name, + name: sanitizeServiceName(name), phase: phaseWaiting, } idx[name] = i @@ -115,7 +132,7 @@ func (t *deployProgressTracker) Update( if !t.interactive { // Non-interactive (CI): one line per event elapsed := svc.elapsed() - line := fmt.Sprintf(" %s: %s", serviceName, phase) + line := fmt.Sprintf(" %s: %s", svc.name, phase) if detail != "" { line += fmt.Sprintf(" (%s)", detail) } diff --git a/cli/azd/internal/cmd/deploy_progress_test.go b/cli/azd/internal/cmd/deploy_progress_test.go index 910cf5fafb7..0f337402be5 100644 --- a/cli/azd/internal/cmd/deploy_progress_test.go +++ b/cli/azd/internal/cmd/deploy_progress_test.go @@ -168,17 +168,40 @@ func TestDeployProgressTracker_DetailTruncation(t *testing.T) { func TestDeployProgressTracker_ServiceNameWithANSI(t *testing.T) { // Security: service names come from azure.yaml and could contain // ANSI escape codes that hijack terminal output. The progress - // tracker should not crash or produce malformed output with such - // names. This is a defense-in-depth test—injection requires a - // malicious azure.yaml the user already controls. + // tracker should strip them to prevent terminal manipulation. ansiName := "\033[31mevil\033[0m" var buf bytes.Buffer tracker := newDeployProgressTracker(&buf, false, []string{ansiName}) - // Should not panic tracker.Update(ansiName, phasePackaging, "building") output := buf.String() - assert.Contains(t, output, "Packaging") + assert.Contains(t, output, "evil: Packaging") + assert.NotContains(t, output, "\033[") +} + +func TestSanitizeServiceName(t *testing.T) { + tests := []struct { + name string + in string + want string + }{ + {"plain", "web-api", "web-api"}, + {"ansi_color", "\033[31mred\033[0m", "red"}, + {"ansi_csi_bold", "\033[1mbold\033[22m", "bold"}, + {"csi_tilde_final", "\033[15~keypress", "keypress"}, + {"osc_title", "\033]0;title\x07rest", "rest"}, + {"osc_st_terminator", "\033]0;title\033\\rest", "rest"}, + {"null_bytes", "svc\x00name", "svcname"}, + {"control_chars", "hello\x01\x02world", "helloworld"}, + {"tab_preserved", "svc\tname", "svc\tname"}, + {"unicode_printable", "café-svc", "café-svc"}, + {"empty", "", ""}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, sanitizeServiceName(tt.in)) + }) + } } func TestDeployProgressTracker_EmptyServiceList(t *testing.T) { diff --git a/cli/azd/internal/cmd/up_graph.go b/cli/azd/internal/cmd/up_graph.go index 740baf51467..c1812023211 100644 --- a/cli/azd/internal/cmd/up_graph.go +++ b/cli/azd/internal/cmd/up_graph.go @@ -597,16 +597,90 @@ func (u *UpGraphAction) Run( log.Printf("warning: failed to invalidate state cache: %v", cacheErr) } + // Emit per-phase duration telemetry for backend performance tracking. + totalMs := since(startTime).Milliseconds() + tracing.SetUsageAttributes(fields.PerfTotalDurationMs.Int64(totalMs)) + provDur, deployDur := phaseDurations(result.Steps) + if provDur > 0 { + tracing.SetUsageAttributes(fields.PerfProvisionDurationMs.Int64(provDur.Milliseconds())) + } + if deployDur > 0 { + tracing.SetUsageAttributes(fields.PerfDeployDurationMs.Int64(deployDur.Milliseconds())) + } + return &actions.ActionResult{ Message: &actions.ResultMessage{ Header: fmt.Sprintf( "Your application was provisioned and deployed to Azure in %s.", ux.DurationAsText(since(startTime)), ), + FollowUp: phaseTimingBreakdown(result.Steps), }, }, nil } +// phaseTimingBreakdown computes wall-clock durations for provisioning and deploying phases +// by finding the earliest start and latest end among matching steps. +// Package/publish steps run concurrently with provisioning, so they are excluded from the +// deploy window to avoid showing a deploy duration that exceeds the total. +func phaseTimingBreakdown(steps []exegraph.StepTiming) string { + provDur, deployDur := phaseDurations(steps) + + var lines []string + if provDur > 0 { + lines = append(lines, fmt.Sprintf(" Provisioning: %s", ux.DurationAsText(provDur))) + } + if deployDur > 0 { + lines = append(lines, fmt.Sprintf(" Deploying: %s", ux.DurationAsText(deployDur))) + } + + if len(lines) == 0 { + return "" + } + return strings.Join(lines, "\n") +} + +// phaseDurations computes the wall-clock duration for provisioning and deploying phases. +// Returns zero durations for phases that were not executed. +func phaseDurations(steps []exegraph.StepTiming) (provision, deploy time.Duration) { + var provStart, deployStart time.Time + var provEnd, deployEnd time.Time + + for _, st := range steps { + if st.Status == exegraph.StepSkipped { + continue + } + switch { + case strings.HasPrefix(st.Name, "provision-") || + strings.HasPrefix(st.Name, "cmdhook-preprovision") || + strings.HasPrefix(st.Name, "cmdhook-postprovision"): + if provStart.IsZero() || st.Start.Before(provStart) { + provStart = st.Start + } + if st.End.After(provEnd) { + provEnd = st.End + } + case strings.HasPrefix(st.Name, "deploy-") || + strings.HasPrefix(st.Name, "cmdhook-predeploy") || + strings.HasPrefix(st.Name, "cmdhook-postdeploy"): + if deployStart.IsZero() || st.Start.Before(deployStart) { + deployStart = st.Start + } + if st.End.After(deployEnd) { + deployEnd = st.End + } + } + } + + if !provStart.IsZero() { + provision = provEnd.Sub(provStart) + } + if !deployStart.IsZero() { + deploy = deployEnd.Sub(deployStart) + } + return provision, deploy +} + // changedFlagNames returns the names of flags that were explicitly set on // the FlagSet. Returns nil for a nil FlagSet so callers can pass through // without a guard. Mirrors the changedFlags collection in diff --git a/cli/azd/internal/cmd/up_graph_test.go b/cli/azd/internal/cmd/up_graph_test.go new file mode 100644 index 00000000000..e39cea392fa --- /dev/null +++ b/cli/azd/internal/cmd/up_graph_test.go @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package cmd + +import ( + "testing" + "time" + + "github.com/azure/azure-dev/cli/azd/pkg/exegraph" +) + +func TestPhaseTimingBreakdown(t *testing.T) { + t.Parallel() + base := time.Now() + + tests := []struct { + name string + steps []exegraph.StepTiming + want string + }{ + { + name: "empty steps", + steps: nil, + want: "", + }, + { + name: "provision only", + steps: []exegraph.StepTiming{ + {Name: "provision-infra", Status: exegraph.StepDone, Start: base, End: base.Add(5 * time.Minute)}, + }, + want: " Provisioning: 5 minutes", + }, + { + name: "deploy only", + steps: []exegraph.StepTiming{ + { + Name: "package-web", Status: exegraph.StepDone, + Start: base, End: base.Add(30 * time.Second), + }, + { + Name: "deploy-web", Status: exegraph.StepDone, + Start: base.Add(30 * time.Second), End: base.Add(90 * time.Second), + }, + }, + want: " Deploying: 1 minute", + }, + { + name: "both phases", + steps: []exegraph.StepTiming{ + { + Name: "provision-infra", Status: exegraph.StepDone, + Start: base, End: base.Add(9 * time.Minute), + }, + { + Name: "package-web", Status: exegraph.StepDone, + Start: base.Add(9 * time.Minute), End: base.Add(10 * time.Minute), + }, + { + Name: "deploy-web", Status: exegraph.StepDone, + Start: base.Add(10 * time.Minute), End: base.Add(11 * time.Minute), + }, + }, + want: " Provisioning: 9 minutes\n Deploying: 1 minute", + }, + { + name: "skipped steps excluded", + steps: []exegraph.StepTiming{ + {Name: "provision-infra", Status: exegraph.StepSkipped, Start: time.Time{}, End: time.Time{}}, + {Name: "deploy-web", Status: exegraph.StepDone, Start: base, End: base.Add(45 * time.Second)}, + }, + want: " Deploying: 45 seconds", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := phaseTimingBreakdown(tt.steps) + if got != tt.want { + t.Errorf("phaseTimingBreakdown() =\n%q\nwant:\n%q", got, tt.want) + } + }) + } +} diff --git a/cli/azd/internal/tracing/fields/fields.go b/cli/azd/internal/tracing/fields/fields.go index b5b63065c65..76aa80d4a48 100644 --- a/cli/azd/internal/tracing/fields/fields.go +++ b/cli/azd/internal/tracing/fields/fields.go @@ -703,6 +703,33 @@ var ( Purpose: PerformanceAndHealth, IsMeasurement: true, } + + // PerfProvisionDurationMs is the wall-clock provisioning phase duration in milliseconds. + // Measured from the earliest provision step start to the latest provision step end. + PerfProvisionDurationMs = AttributeKey{ + Key: attribute.Key("perf.provision_duration_ms"), + Classification: SystemMetadata, + Purpose: PerformanceAndHealth, + IsMeasurement: true, + } + + // PerfDeployDurationMs is the wall-clock deploying phase duration in milliseconds. + // Measured from the earliest deploy step start to the latest deploy step end. + // Package and publish steps are excluded (they run concurrently with provisioning). + PerfDeployDurationMs = AttributeKey{ + Key: attribute.Key("perf.deploy_duration_ms"), + Classification: SystemMetadata, + Purpose: PerformanceAndHealth, + IsMeasurement: true, + } + + // PerfTotalDurationMs is the total wall-clock duration for the entire up-graph execution. + PerfTotalDurationMs = AttributeKey{ + Key: attribute.Key("perf.total_duration_ms"), + Classification: SystemMetadata, + Purpose: PerformanceAndHealth, + IsMeasurement: true, + } ) // Pack related fields diff --git a/cli/azd/magefile.go b/cli/azd/magefile.go index 2ff84a0b31e..f56d0f5048a 100644 --- a/cli/azd/magefile.go +++ b/cli/azd/magefile.go @@ -566,15 +566,15 @@ var excludedPlaybackTests = map[string]string{ // Recordings affected by feat/exegraph: the graph-driven up/provision path // introduces legitimate new HTTP interactions (layer hash probes, resource-group // existence checks). Must be re-recorded with live Azure credentials before merge. - "Test_DeploymentStacks": "needs re-record for feat/exegraph graph-driven provision", - "Test_CLI_ProvisionState": "needs re-record for feat/exegraph graph-driven provision", - "Test_CLI_InfraCreateAndDeleteUpperCase": "needs re-record for feat/exegraph graph-driven provision", - "Test_CLI_PreflightQuota_Sub_DefaultCapacity": "stale recording; missing extension registry + resource group interactions", - "Test_CLI_PreflightQuota_Sub_InvalidModelName": "stale recording; missing extension registry + resource group interactions", + "Test_DeploymentStacks": "needs re-record for feat/exegraph graph-driven provision", + "Test_CLI_ProvisionState": "needs re-record for feat/exegraph graph-driven provision", + "Test_CLI_InfraCreateAndDeleteUpperCase": "needs re-record for feat/exegraph graph-driven provision", + "Test_CLI_PreflightQuota_Sub_DefaultCapacity": "stale recording; missing extension registry + resource group interactions", + "Test_CLI_PreflightQuota_Sub_InvalidModelName": "stale recording; missing extension registry + resource group interactions", "Test_CLI_PreflightQuota_Sub_DifferentLocation": "stale recording; missing extension registry + resource group interactions", - "Test_CLI_PreflightQuota_RG_DefaultCapacity": "stale recording; missing extension registry + resource group interactions", - "Test_CLI_PreflightQuota_RG_InvalidVersion": "stale recording; missing extension registry + resource group interactions", - "Test_CLI_PreflightQuota_RG_InvalidModelName": "stale recording; missing extension registry + resource group interactions", + "Test_CLI_PreflightQuota_RG_DefaultCapacity": "stale recording; missing extension registry + resource group interactions", + "Test_CLI_PreflightQuota_RG_InvalidVersion": "stale recording; missing extension registry + resource group interactions", + "Test_CLI_PreflightQuota_RG_InvalidModelName": "stale recording; missing extension registry + resource group interactions", } // discoverPlaybackTests scans the recordings directory for .yaml files and diff --git a/cli/azd/pkg/environment/manager.go b/cli/azd/pkg/environment/manager.go index 832f0311c28..7132f208754 100644 --- a/cli/azd/pkg/environment/manager.go +++ b/cli/azd/pkg/environment/manager.go @@ -107,6 +107,24 @@ type manager struct { // parallel services running publish/deploy/hooks against the same // *Environment cannot corrupt the on-disk .env file or interleave a // partial Reload with another goroutine's writes. + // + // Lock acquisition order (MUST be consistent across all call sites): + // + // 1. manager.saveMu (in-process sync.Mutex, serializes goroutines) + // 2. local flock (cross-process OS file lock via gofrs/flock) + // 3. env.mu (per-Environment sync.RWMutex, protects in-memory map) + // + // Subprocess hooks (`azd env set`) spawn a separate azd process with + // its own saveMu instance (in-process mutexes are not shared across + // process boundaries). Within the subprocess, the same acquisition + // order applies: saveMu → flock → env.mu. The in-process saveMu + // prevents concurrent Save/Reload within the SAME process; the + // cross-process flock prevents concurrent file I/O across DIFFERENT + // processes (e.g., parallel hook subprocesses). No deadlock is + // possible because saveMu is never shared cross-process and flock + // handles inter-process serialization. + // + // See also: docs/concurrency-model.md "Lock Acquisition Order" section. saveMu sync.Mutex // State cache manager for managing cached Azure resource information