Azure · jongio · May 6, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
@@ -169,6 +169,64 @@ both attempt initialization; the lock ensures only one succeeds.
 
 ---
 
+## Lock Acquisition Order
+
+Consistent lock ordering prevents deadlocks. The environment persistence path
+uses a three-level hierarchy:
+
+```text
+1. manager.saveMu        (in-process sync.Mutex — serializes goroutines)
+2. local flock            (cross-process OS file lock — serializes processes)
+3. env.mu                 (per-Environment sync.RWMutex — protects in-memory map)
+```
+
+Every code path that persists or reloads environment state acquires these locks
+**in the order above**. Never acquire an outer lock while holding an inner one.
+
+### Why subprocess hooks cannot deadlock
+
+Parallel service hooks (pre/post-deploy, pre/post-package) may spawn `azd env set`
+subprocesses concurrently. Each subprocess is a separate OS process with its **own**
+`manager.saveMu` instance — in-process mutexes are not shared across processes.
+
+Cross-process serialization is handled entirely by **flock** (the OS-level file
+lock on the `.env.lock` file). Within each subprocess the same acquisition order
+applies (saveMu → flock → env.mu), but since saveMu is per-process and never
+shared across process boundaries, circular wait is impossible.
+
+```text
+┌─────────────────────────────────────────────┐
+│ Parent azd process                          │
+│                                             │
+│  goroutine A: saveMu → flock → env.mu      │
+│  goroutine B: saveMu → flock → env.mu      │
+│  (saveMu serializes A and B)               │
+└─────────────────────────────────────────────┘
+
+┌─────────────────────────────────────────────┐
+│ Hook subprocess 1 (`azd env set FOO=bar`)   │
+│  main goroutine: saveMu → flock → env.mu   │
+└─────────────────────────────────────────────┘
+
+┌─────────────────────────────────────────────┐
+│ Hook subprocess 2 (`azd env set BAZ=qux`)   │
+│  main goroutine: saveMu → flock → env.mu   │
+└─────────────────────────────────────────────┘
+
+Across processes, only flock provides mutual exclusion.
+Within a process, saveMu prevents goroutine interleaving.
+```
+
+### What is held during subprocess invocations
+
+When the parent process launches hook subprocesses, it does **not** hold saveMu
+or flock during the subprocess lifetime. The hook runner starts the subprocess
+and waits for it to exit — locks are only acquired when the subprocess (or the
+parent) calls `Save`/`Reload`. This means hook subprocesses never contend with
+locks held by the parent's hook-launch path.
+
+---
+
 ## Adding new concurrent state
 
 When you introduce a new field on one of the types above (or a new type that

@@ -7,9 +7,11 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
+	"unicode"
 )
 
 // Column widths for the progress table.
@@ -69,14 +71,29 @@ type deployProgressTracker struct {
 	lastLines   int // how many lines we rendered last time (for ANSI overwrite)
 }
 
+// ansiEscapeRe matches ANSI escape sequences (CSI and OSC).
+var ansiEscapeRe = regexp.MustCompile(`\x1b\[[0-9;]*[A-Za-z~]|\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)`)
+
+// sanitizeServiceName strips ANSI escape sequences and non-printable characters
+// from a service name to prevent terminal output manipulation.
+func sanitizeServiceName(name string) string {
+	name = ansiEscapeRe.ReplaceAllString(name, "")
+	return strings.Map(func(r rune) rune {
+		if r == '\t' || (unicode.IsPrint(r) && !unicode.IsControl(r)) {
+			return r
+		}
+		return -1
+	}, name)
+}
+
 func newDeployProgressTracker(
 	writer io.Writer, interactive bool, serviceNames []string,
 ) *deployProgressTracker {
 	services := make([]*serviceStatus, len(serviceNames))
 	idx := make(map[string]int, len(serviceNames))
 	for i, name := range serviceNames {
 		services[i] = &serviceStatus{
-			name:  name,
+			name:  sanitizeServiceName(name),
 			phase: phaseWaiting,
 		}
 		idx[name] = i
@@ -115,7 +132,7 @@ func (t *deployProgressTracker) Update(
 	if !t.interactive {
 		// Non-interactive (CI): one line per event
 		elapsed := svc.elapsed()
-		line := fmt.Sprintf("  %s: %s", serviceName, phase)
+		line := fmt.Sprintf("  %s: %s", svc.name, phase)
 		if detail != "" {
 			line += fmt.Sprintf(" (%s)", detail)
 		}

@@ -168,17 +168,40 @@ func TestDeployProgressTracker_DetailTruncation(t *testing.T) {
 func TestDeployProgressTracker_ServiceNameWithANSI(t *testing.T) {
 	// Security: service names come from azure.yaml and could contain
 	// ANSI escape codes that hijack terminal output. The progress
-	// tracker should not crash or produce malformed output with such
-	// names. This is a defense-in-depth test—injection requires a
-	// malicious azure.yaml the user already controls.
+	// tracker should strip them to prevent terminal manipulation.
 	ansiName := "\033[31mevil\033[0m"
 	var buf bytes.Buffer
 	tracker := newDeployProgressTracker(&buf, false, []string{ansiName})
 
-	// Should not panic
 	tracker.Update(ansiName, phasePackaging, "building")
 	output := buf.String()
-	assert.Contains(t, output, "Packaging")
+	assert.Contains(t, output, "evil: Packaging")
+	assert.NotContains(t, output, "\033[")
+}
+
+func TestSanitizeServiceName(t *testing.T) {
+	tests := []struct {
+		name string
+		in   string
+		want string
+	}{
+		{"plain", "web-api", "web-api"},
+		{"ansi_color", "\033[31mred\033[0m", "red"},
+		{"ansi_csi_bold", "\033[1mbold\033[22m", "bold"},
+		{"csi_tilde_final", "\033[15~keypress", "keypress"},
+		{"osc_title", "\033]0;title\x07rest", "rest"},
+		{"osc_st_terminator", "\033]0;title\033\\rest", "rest"},
+		{"null_bytes", "svc\x00name", "svcname"},
+		{"control_chars", "hello\x01\x02world", "helloworld"},
+		{"tab_preserved", "svc\tname", "svc\tname"},
+		{"unicode_printable", "café-svc", "café-svc"},
+		{"empty", "", ""},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.want, sanitizeServiceName(tt.in))
+		})
+	}
 }
 
 func TestDeployProgressTracker_EmptyServiceList(t *testing.T) {

@@ -597,16 +597,90 @@ func (u *UpGraphAction) Run(
 		log.Printf("warning: failed to invalidate state cache: %v", cacheErr)
 	}
 
+	// Emit per-phase duration telemetry for backend performance tracking.
+	totalMs := since(startTime).Milliseconds()
+	tracing.SetUsageAttributes(fields.PerfTotalDurationMs.Int64(totalMs))
+	provDur, deployDur := phaseDurations(result.Steps)
+	if provDur > 0 {
+		tracing.SetUsageAttributes(fields.PerfProvisionDurationMs.Int64(provDur.Milliseconds()))
+	}
+	if deployDur > 0 {
+		tracing.SetUsageAttributes(fields.PerfDeployDurationMs.Int64(deployDur.Milliseconds()))
+	}
+
 	return &actions.ActionResult{
 		Message: &actions.ResultMessage{
 			Header: fmt.Sprintf(
 				"Your application was provisioned and deployed to Azure in %s.",
 				ux.DurationAsText(since(startTime)),
 			),
+			FollowUp: phaseTimingBreakdown(result.Steps),
 		},
 	}, nil
 }
 
+// phaseTimingBreakdown computes wall-clock durations for provisioning and deploying phases
+// by finding the earliest start and latest end among matching steps.
+// Package/publish steps run concurrently with provisioning, so they are excluded from the
+// deploy window to avoid showing a deploy duration that exceeds the total.
+func phaseTimingBreakdown(steps []exegraph.StepTiming) string {
+	provDur, deployDur := phaseDurations(steps)
+
+	var lines []string
+	if provDur > 0 {
+		lines = append(lines, fmt.Sprintf("  Provisioning: %s", ux.DurationAsText(provDur)))
+	}
+	if deployDur > 0 {
+		lines = append(lines, fmt.Sprintf("  Deploying:    %s", ux.DurationAsText(deployDur)))
+	}
+
+	if len(lines) == 0 {
+		return ""
+	}
+	return strings.Join(lines, "\n")
+}
+
+// phaseDurations computes the wall-clock duration for provisioning and deploying phases.
+// Returns zero durations for phases that were not executed.
+func phaseDurations(steps []exegraph.StepTiming) (provision, deploy time.Duration) {
+	var provStart, deployStart time.Time
+	var provEnd, deployEnd time.Time
+
+	for _, st := range steps {
+		if st.Status == exegraph.StepSkipped {
+			continue
+		}
+		switch {
+		case strings.HasPrefix(st.Name, "provision-") ||
+			strings.HasPrefix(st.Name, "cmdhook-preprovision") ||
+			strings.HasPrefix(st.Name, "cmdhook-postprovision"):
+			if provStart.IsZero() || st.Start.Before(provStart) {
+				provStart = st.Start
+			}
+			if st.End.After(provEnd) {
+				provEnd = st.End
+			}
+		case strings.HasPrefix(st.Name, "deploy-") ||
+			strings.HasPrefix(st.Name, "cmdhook-predeploy") ||
+			strings.HasPrefix(st.Name, "cmdhook-postdeploy"):
+			if deployStart.IsZero() || st.Start.Before(deployStart) {
+				deployStart = st.Start
+			}
+			if st.End.After(deployEnd) {
+				deployEnd = st.End
+			}
+		}
+	}
+
+	if !provStart.IsZero() {
+		provision = provEnd.Sub(provStart)
+	}
+	if !deployStart.IsZero() {
+		deploy = deployEnd.Sub(deployStart)
+	}
+	return provision, deploy
+}
+
 // changedFlagNames returns the names of flags that were explicitly set on
 // the FlagSet. Returns nil for a nil FlagSet so callers can pass through
 // without a guard. Mirrors the changedFlags collection in

@@ -0,0 +1,85 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+package cmd
+
+import (
+	"testing"
+	"time"
+
+	"github.com/azure/azure-dev/cli/azd/pkg/exegraph"
+)
+
+func TestPhaseTimingBreakdown(t *testing.T) {
+	t.Parallel()
+	base := time.Now()
+
+	tests := []struct {
+		name  string
+		steps []exegraph.StepTiming
+		want  string
+	}{
+		{
+			name:  "empty steps",
+			steps: nil,
+			want:  "",
+		},
+		{
+			name: "provision only",
+			steps: []exegraph.StepTiming{
+				{Name: "provision-infra", Status: exegraph.StepDone, Start: base, End: base.Add(5 * time.Minute)},
+			},
+			want: "  Provisioning: 5 minutes",
+		},
+		{
+			name: "deploy only",
+			steps: []exegraph.StepTiming{
+				{
+					Name: "package-web", Status: exegraph.StepDone,
+					Start: base, End: base.Add(30 * time.Second),
+				},
+				{
+					Name: "deploy-web", Status: exegraph.StepDone,
+					Start: base.Add(30 * time.Second), End: base.Add(90 * time.Second),
+				},
+			},
+			want: "  Deploying:    1 minute",
+		},
+		{
+			name: "both phases",
+			steps: []exegraph.StepTiming{
+				{
+					Name: "provision-infra", Status: exegraph.StepDone,
+					Start: base, End: base.Add(9 * time.Minute),
+				},
+				{
+					Name: "package-web", Status: exegraph.StepDone,
+					Start: base.Add(9 * time.Minute), End: base.Add(10 * time.Minute),
+				},
+				{
+					Name: "deploy-web", Status: exegraph.StepDone,
+					Start: base.Add(10 * time.Minute), End: base.Add(11 * time.Minute),
+				},
+			},
+			want: "  Provisioning: 9 minutes\n  Deploying:    1 minute",
+		},
+		{
+			name: "skipped steps excluded",
+			steps: []exegraph.StepTiming{
+				{Name: "provision-infra", Status: exegraph.StepSkipped, Start: time.Time{}, End: time.Time{}},
+				{Name: "deploy-web", Status: exegraph.StepDone, Start: base, End: base.Add(45 * time.Second)},
+			},
+			want: "  Deploying:    45 seconds",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			got := phaseTimingBreakdown(tt.steps)
+			if got != tt.want {
+				t.Errorf("phaseTimingBreakdown() =\n%q\nwant:\n%q", got, tt.want)
+			}
+		})
+	}
+}
@@ -703,6 +703,33 @@ var (
 		Purpose:        PerformanceAndHealth,
 		IsMeasurement:  true,
 	}
+
+	// PerfProvisionDurationMs is the wall-clock provisioning phase duration in milliseconds.
+	// Measured from the earliest provision step start to the latest provision step end.
+	PerfProvisionDurationMs = AttributeKey{
+		Key:            attribute.Key("perf.provision_duration_ms"),
+		Classification: SystemMetadata,
+		Purpose:        PerformanceAndHealth,
+		IsMeasurement:  true,
+	}
+
+	// PerfDeployDurationMs is the wall-clock deploying phase duration in milliseconds.
+	// Measured from the earliest deploy step start to the latest deploy step end.
+	// Package and publish steps are excluded (they run concurrently with provisioning).
+	PerfDeployDurationMs = AttributeKey{
+		Key:            attribute.Key("perf.deploy_duration_ms"),
+		Classification: SystemMetadata,
+		Purpose:        PerformanceAndHealth,
+		IsMeasurement:  true,
+	}
+
+	// PerfTotalDurationMs is the total wall-clock duration for the entire up-graph execution.
+	PerfTotalDurationMs = AttributeKey{
+		Key:            attribute.Key("perf.total_duration_ms"),
+		Classification: SystemMetadata,
+		Purpose:        PerformanceAndHealth,
+		IsMeasurement:  true,
+	}
 )
 
 // Pack related fields

@@ -566,15 +566,15 @@ var excludedPlaybackTests = map[string]string{
 	// Recordings affected by feat/exegraph: the graph-driven up/provision path
 	// introduces legitimate new HTTP interactions (layer hash probes, resource-group
 	// existence checks). Must be re-recorded with live Azure credentials before merge.
-	"Test_DeploymentStacks":                  "needs re-record for feat/exegraph graph-driven provision",
-	"Test_CLI_ProvisionState":                "needs re-record for feat/exegraph graph-driven provision",
-	"Test_CLI_InfraCreateAndDeleteUpperCase":       "needs re-record for feat/exegraph graph-driven provision",
-	"Test_CLI_PreflightQuota_Sub_DefaultCapacity":  "stale recording; missing extension registry + resource group interactions",
-	"Test_CLI_PreflightQuota_Sub_InvalidModelName": "stale recording; missing extension registry + resource group interactions",
+	"Test_DeploymentStacks":                         "needs re-record for feat/exegraph graph-driven provision",
+	"Test_CLI_ProvisionState":                       "needs re-record for feat/exegraph graph-driven provision",
+	"Test_CLI_InfraCreateAndDeleteUpperCase":        "needs re-record for feat/exegraph graph-driven provision",
+	"Test_CLI_PreflightQuota_Sub_DefaultCapacity":   "stale recording; missing extension registry + resource group interactions",
+	"Test_CLI_PreflightQuota_Sub_InvalidModelName":  "stale recording; missing extension registry + resource group interactions",
 	"Test_CLI_PreflightQuota_Sub_DifferentLocation": "stale recording; missing extension registry + resource group interactions",
-	"Test_CLI_PreflightQuota_RG_DefaultCapacity":   "stale recording; missing extension registry + resource group interactions",
-	"Test_CLI_PreflightQuota_RG_InvalidVersion":    "stale recording; missing extension registry + resource group interactions",
-	"Test_CLI_PreflightQuota_RG_InvalidModelName":  "stale recording; missing extension registry + resource group interactions",
+	"Test_CLI_PreflightQuota_RG_DefaultCapacity":    "stale recording; missing extension registry + resource group interactions",
+	"Test_CLI_PreflightQuota_RG_InvalidVersion":     "stale recording; missing extension registry + resource group interactions",
+	"Test_CLI_PreflightQuota_RG_InvalidModelName":   "stale recording; missing extension registry + resource group interactions",
 }
 
 // discoverPlaybackTests scans the recordings directory for .yaml files and