diff --git a/.vscode/cspell.misc.yaml b/.vscode/cspell.misc.yaml index 043e51e8def..8b48fff45c8 100644 --- a/.vscode/cspell.misc.yaml +++ b/.vscode/cspell.misc.yaml @@ -80,26 +80,75 @@ overrides: - filename: docs/architecture/**/*.md words: - azapi + - azext + - appinsights + - coreai + - devcliapprequests + - devdiv + - dmitryr - Errorf + - GDPRAPI - grpcbroker + - KQLX + - Kusto + - Pseudonymized + - Thhmmss + - unhashed - vsrpc - filename: docs/guides/**/*.md words: + - azdext + - dcount + - devcliapprequests - errorlint - Errorf - gofmt - golangci - gosec - jaegertracing + - Kusto - mycommand + - myfeature - mypackage - pflag + - Pseudonymized - staticcheck - stdlib + - tostring + - unhashed - vsrpc - filename: docs/reference/**/*.md words: + - appinit - appservice - Buildpacks + - Ccids - containerapp + - dcount + - devdeviceid + - Entra + - Errorf + - exegraph + - kqlx + - Kusto + - oneauth + - paas + - PBIP + - pbip + - postprovision + - preprovision + - postdeploy + - pseudonymized + - remotebuild + - resourcegroup + - rollups + - startswith - staticwebapp + - toreal + - toscalar + - tostring + - Tpid + - TPID + - unhashed + - vsrpc + - whatif diff --git a/docs/README.md b/docs/README.md index f60a383ff37..d40083d4995 100644 --- a/docs/README.md +++ b/docs/README.md @@ -22,6 +22,8 @@ Task-oriented how-tos for common contributor workflows. 
- [Adding a New Command](guides/adding-a-new-command.md) — End-to-end walkthrough for new CLI commands - [Creating an Extension](guides/creating-an-extension.md) — How to build and publish an azd extension - [Observability and Tracing](guides/observability.md) — Adding telemetry, traces, and debugging +- [Feature Telemetry Guide](guides/feature-telemetry.md) — End-to-end guide for instrumenting telemetry in new features +- [Telemetry Overview](guides/telemetry-overview.md) — Product-facing overview of azd telemetry metrics and dashboards ## Reference @@ -30,6 +32,8 @@ Schemas, flags, environment variables, and configuration details. - [Environment Variables](reference/environment-variables.md) — All environment variables that configure azd behavior - [azure.yaml Schema](reference/azure-yaml-schema.md) — Project configuration file reference - [Feature Status](reference/feature-status.md) — Current maturity status of all features +- [Telemetry Data Reference](reference/telemetry-data.md) — Complete schema of all telemetry events, fields, and query patterns +- [Telemetry Dashboards & Reports](reference/telemetry-dashboards.md) — Power BI reports, Kusto functions, and analysis tools ## Architecture @@ -39,6 +43,7 @@ System overviews, design context, and decision records. 
- [Command Execution Model](architecture/command-execution-model.md) — How commands are registered, resolved, and run - [Extension Framework](architecture/extension-framework.md) — gRPC-based extension system architecture - [Provisioning Pipeline](architecture/provisioning-pipeline.md) — How infrastructure provisioning works +- [Telemetry Architecture](architecture/telemetry.md) — End-to-end telemetry pipeline across 3 repositories - [ADR Template](architecture/adr-template.md) — Template for lightweight architecture decision records --- diff --git a/docs/architecture/telemetry.md b/docs/architecture/telemetry.md new file mode 100644 index 00000000000..bf6efd2c34a --- /dev/null +++ b/docs/architecture/telemetry.md @@ -0,0 +1,368 @@ +# Azure Developer CLI — Telemetry Architecture + +> End-to-end reference for how telemetry flows through the azd ecosystem. +> + + +## Overview + +azd telemetry spans three repositories, each owning a distinct layer: + +| Repository | Layer | What It Does | +|-----------|-------|-------------| +| [`Azure/azure-dev`](https://github.com/Azure/azure-dev) | **Instrumentation** | CLI + VS Code extension + extension framework emit OpenTelemetry spans | +| [`devdiv-azure-service-dmitryr/azd-queries`](https://github.com/devdiv-azure-service-dmitryr/azd-queries) | **Pipeline & Governance** | GDPR classification, Kusto table sync, KQL query library | +| [`coreai-microsoft/azure-dev-tools`](https://github.com/coreai-microsoft/azure-dev-tools) → `product-telemetry/azd/` | **Analysis** | Power BI reports, Kusto functions, funnel metrics, investigations | + +## End-to-End Data Flow + +```mermaid +flowchart TB + subgraph Instrumentation ["azure-dev (Instrumentation)"] + CLI["azd CLI
(Go + OpenTelemetry)"] + VSC["VS Code Extension
(@microsoft/vscode-azext-utils)"] + EXT["Extensions
(structured error reporting)"] + end + + subgraph Export ["CLI Export Pipeline"] + MW["Command Middleware
cli/azd/cmd/middleware/telemetry.go"] + OTel["OTel TracerProvider"] + AIExp["App Insights Exporter
SpanToEnvelope()"] + DiskQ["Disk Queue
~/.azd/telemetry/*.trn"] + Upload["azd telemetry upload
(background / deferred)"] + end + + subgraph Ingestion ["Azure Monitor / Kusto"] + AppInsights["Azure Application Insights"] + Kusto["Azure Data Explorer (Kusto)
DDAzureClients.DevCli"] + RawTable["RawEventsAppRequests"] + end + + subgraph Pipeline ["azd-queries (Pipeline & Governance)"] + GDPR["GDPR Classify Pipeline
eng/pipelines/classify.yml"] + GDPRTool["gdpr tool
(export → publish → ingest)"] + GDPRAPI["GDPR API"] + TableSync["Kusto Table Sync
.github/workflows/ci.yml"] + IngestScripts["Ingest Scripts
(templates, template versions)"] + KQLLib["KQL Query Library
(core-usage, insights, aspire, vscode)"] + end + + subgraph Analysis ["azure-dev-tools (Analysis)"] + KustoFn["Kusto Functions
(getAzdEvents, addTemplateColumns, etc.)"] + PBI["Power BI Reports
(KPIs, funnels, user journeys)"] + Investigations["Ad-hoc Investigations"] + end + + CLI --> MW --> OTel --> AIExp --> DiskQ --> Upload --> AppInsights + VSC -->|VS Code telemetry framework| AppInsights + EXT -->|structured errors via host| MW + AppInsights --> Kusto --> RawTable + + GDPR -->|reads azure-dev source| GDPRTool --> GDPRAPI + TableSync --> Kusto + IngestScripts --> Kusto + KQLLib -->|queries| RawTable + + KustoFn -->|deployed to DDAzureClients.DevCli| RawTable + PBI -->|reads via| KustoFn + Investigations -->|ad-hoc KQL| RawTable +``` + +## CLI Telemetry Pipeline (Detail) + +### 1. Initialization + +**File:** `cli/azd/internal/telemetry/telemetry.go` + +When `azd` starts, the telemetry subsystem: + +1. Checks `AZURE_DEV_COLLECT_TELEMETRY` — if set to `"no"`, telemetry is disabled entirely +2. In Cloud Shell, shows a first-run consent notice (creates `~/.azd/first-run` marker) +3. Creates a `StorageQueue` backed by the filesystem at `~/.azd/telemetry/` +4. Initializes the **App Insights Exporter** — a custom OTel `SpanExporter` that converts spans to Application Insights envelopes +5. Optionally adds: + - Stdout trace exporter (via `--trace-log-file`) + - OTLP HTTP exporter (via `--trace-log-url`) +6. Creates an OTel `TracerProvider` with the configured exporters +7. Registers the provider globally via `otel.SetTracerProvider(tp)` + +### 2. 
Command Execution → Span Creation + +**File:** `cli/azd/cmd/middleware/telemetry.go` + +Every azd command is wrapped by the telemetry middleware: + +``` +user runs `azd deploy` + → middleware.Run() + → tracing.Start(ctx, "cmd.deploy") // creates OTel span + → records attributes: + cmd.entry, cmd.flags, cmd.args.count, + platform.type, installed extensions (id@version) + → runs the actual command action + → on completion: + adds usage attributes (from baggage) + adds perf.interact_time + maps errors via cmd.MapError() + → span.End() +``` + +For **extension commands**, the event name switches to `ext.run` and records `ext.id`, `ext.version`. + +### 3. Span Export → Disk Queue + +**File:** `cli/azd/internal/telemetry/storage_exporter.go` + +The custom exporter: + +1. Receives completed OTel spans +2. Converts each to an Application Insights `Envelope` with `RequestData` (via `SpanToEnvelope()`) +3. Serializes as NDJSON +4. Enqueues to the disk queue (`~/.azd/telemetry/YYYYMMDDThhmmss_retry_random.trn`) +5. Retries enqueue up to 3 times on failure + +### 4. Disk Queue → Upload + +**Files:** `cli/azd/internal/telemetry/storage.go`, `uploader.go` + +The disk queue is a FIFO queue implemented as timestamped files: + +- `Peek()` picks the oldest ready item (not older than `itemFileMaxTimeKept`) +- `Cleanup()` removes stale `.tmp` files and expired items + +Upload happens via `azd telemetry upload` (triggered as a background subprocess): + +1. Acquires `upload.lock` (file lock) +2. Loops: `Peek → Transmit → Remove` +3. Retries up to `maxRetryCount=3` with backoff +4. Handles partial success and `Retry-After` headers from App Insights ingestion + +### 5. 
App Insights Envelope Format + +**File:** `cli/azd/internal/telemetry/appinsights-exporter/span_to_envelope.go` + +Each span becomes a `contracts.Envelope` containing `RequestData`: + +| Envelope Field | Source | +|---------------|--------| +| `IKey` | Instrumentation key from connection string | +| `Tags[ai.application.ver]` | `service.version` resource attribute | +| `Tags[ai.user.*]` | `UserAccountId`, `UserAuthUserId`, `UserId`, `SessionId` | +| `Properties` | String/bool span attributes | +| `Measurements` | Int64/float64 span attributes | +| `Name` | Span name (e.g., `cmd.deploy`) | +| `Duration` | Span duration (App Insights format) | +| `ResponseCode` | Span status / result code | +| `Success` | Span status == OK | + +Slice attributes are JSON-serialized into `Properties`. + +### 6. Ingestion → Kusto + +Envelopes are POSTed (gzip compressed) to the App Insights ingestion endpoint. From there, data flows into Azure Data Explorer: + +- **Cluster:** `DDAzureClients` +- **Database:** `DevCli` +- **Primary table:** `RawEventsAppRequests` + +## VS Code Extension Telemetry + +**Files:** `ext/vscode/src/telemetry/` + +The VS Code extension uses a **separate telemetry path** from the CLI: + +```mermaid +flowchart LR + VSExt["VS Code Extension"] --> VSFw["VS Code Telemetry Framework
(@microsoft/vscode-azext-utils)"] + VSFw --> AppInsights2["Application Insights"] + AppInsights2 --> Kusto2["Kusto"] +``` + +**Key differences from CLI:** + +| Aspect | CLI | VS Code Extension | +|--------|-----|-------------------| +| Framework | Go + OpenTelemetry | TypeScript + vscode-azext-utils | +| Export | Custom App Insights exporter + disk queue | VS Code telemetry framework (direct) | +| Opt-out | `AZURE_DEV_COLLECT_TELEMETRY=no` | `telemetry.telemetryLevel=off` in VS Code settings | +| Events | `cmd.*`, `ext.*`, `mcp.*`, etc. | `azure-dev.*` (activate, deactivate, tasks, surveys) | + +**Extension events** (`ext/vscode/src/telemetry/telemetryId.ts`): + +- Lifecycle: `azure-dev.activate`, `azure-dev.deactivate` +- CLI command tasks: `deploy`, `provision`, `up`, `down`, `init`, `login`, `restore`, `package` +- Environment/extension actions +- Survey tracking: `azure-dev.survey-check`, `azure-dev.survey-prompt-response` +- Activity statistics: tracks `totalActiveDays` via VS Code Memento storage + +## Extension Framework Telemetry + +**Files:** `cli/azd/cmd/middleware/telemetry.go` (host side), `cli/azd/docs/extensions/extension-framework.md` + +Extensions run as separate processes and report back to the azd host: + +```mermaid +flowchart LR + Ext["Extension Process"] -->|structured error| Host["azd Host"] + Host -->|maps to span attributes| MW["Telemetry Middleware"] + MW --> Span["OTel Span
(event: ext.run)"] +``` + +- Extension commands emit `ext.run` events with `ext.id` and `ext.version` +- Extensions can report **structured errors** back to the host via `ExtensionService.ReportError` +- Error result codes follow conventions: + - Service errors: `ext.service.<service>.<statusCode>` (e.g., `ext.service.arm.500`) + - Validation: `ext.validation.*` + - Auth: `ext.auth.*` + - Dependency: `ext.dependency.*` +- Extension lifecycle events: `ext.install`, `ext.upgrade`, `ext.promote` + +## GDPR Classification Pipeline + +**Repo:** `azd-queries` → `eng/pipelines/classify.yml` + +This pipeline ensures all telemetry fields are properly classified for GDPR compliance: + +```mermaid +flowchart TB + subgraph Sources ["azure-dev (source of truth)"] + Events["tracing/events/events.go"] + Fields["tracing/fields/fields.go"] + end + + subgraph Pipeline ["azd-queries (classify pipeline)"] + Export["gdpr export
→ events.json + fields.json"] + Publish["gdpr publish
→ GDPR API"] + end + + subgraph GDPR ["GDPR System"] + API["GDPR API
(product: ai.devcliapprequests)"] + end + + Events --> Export + Fields --> Export + Export --> Publish --> API +``` + +**How it works:** + +1. The pipeline checks out **both** `azd-queries` and `Azure/azure-dev` +2. Builds the `gdpr` Go tool (`eng/tools/gdpr/`) +3. Runs `gdpr export` — parses `events.go` and `fields/` to produce `events.json` and `fields.json` +4. Runs `gdpr publish` — pushes metadata to the GDPR API under product code `ai.devcliapprequests` +5. Runs on a schedule for production/staging environments + +**GDPR tool commands:** +- `export` — extract event/field metadata from Go source +- `publish` — push metadata to GDPR API +- `ingest` — ingest metadata into Kusto +- `delete` — retire/remove fields from classification + +## Consent & Privacy + +### Opt-Out + +| Surface | Mechanism | +|---------|-----------| +| CLI | Set `AZURE_DEV_COLLECT_TELEMETRY=no` | +| VS Code | Set `telemetry.telemetryLevel` to `off` in VS Code settings | +| Cloud Shell | First-run notice shown; opt-out instructions provided | + +### PII Protection + +- **Hashed fields:** `project.template.id`, `project.template.version`, `project.name`, `env.name` are SHA-256 hashed (case-insensitive) before emission +- **Data classifications** are annotated on every field: + - `PublicPersonalData` + - `SystemMetadata` + - `CallstackOrException` + - `CustomerContent` + - `EndUserPseudonymizedInformation` + - `OrganizationalIdentifiableInformation` +- **Privacy review required** for: new telemetry fields, classification changes, any unhashed PII (see `docs/specs/metrics-audit/privacy-review-checklist.md`) + +## Kusto Table Sync & Ingestion + +**Repo:** `azd-queries` → `.github/workflows/ci.yml` + +- On PRs: `./ksd build tables` (validates table definitions) +- On merge: `./ksd sync tables` (publishes table definitions to Kusto) +- Ingestion scripts run to sync template metadata: + - `ingest/ingest-templates.kql` + - `ingest/ingest-template-versions.kql` +- Template metadata is updated via 
`eng/Update-Templates.ps1` before ingest + +## Key Files Reference + +### azure-dev (Instrumentation) +``` +cli/azd/ +├── cmd/middleware/telemetry.go # Command-level span middleware +├── internal/ +│ ├── telemetry/ +│ │ ├── telemetry.go # Pipeline init, env vars, consent +│ │ ├── storage.go # Disk queue (FIFO) +│ │ ├── storage_exporter.go # OTel exporter → disk queue +│ │ ├── uploader.go # Queue → App Insights upload +│ │ ├── notice.go # First-run consent notice +│ │ └── appinsights-exporter/ +│ │ ├── span_to_envelope.go # Span → App Insights envelope +│ │ ├── transmitter.go # HTTP POST to ingestion +│ │ ├── endpoint_config.go # Connection string parsing +│ │ └── transmit_payload.go # NDJSON serialization +│ └── tracing/ +│ ├── tracing.go # Global tracer +│ ├── attributes.go # Global/usage baggage +│ ├── events/events.go # All event name constants +│ └── fields/ +│ ├── fields.go # All field keys + classifications +│ └── key.go # SHA-256 hashing helpers +ext/vscode/src/telemetry/ +├── telemetryId.ts # Extension event IDs +└── activityStatisticsService.ts # Active days tracking +docs/specs/metrics-audit/ +├── telemetry-schema.md # Canonical schema reference +└── privacy-review-checklist.md # Privacy review process +``` + +### azd-queries (Pipeline & Governance) +``` +eng/ +├── pipelines/classify.yml # GDPR classification pipeline +└── tools/gdpr/ + ├── README.md # Tool documentation + ├── cmd/ # export, publish, ingest, delete + └── pkg/gdpr/convert.go # Event/field → GDPR row conversion +.github/workflows/ +├── ci.yml # Kusto table sync + ingest +└── amplitude-export.yml # Kusto → Amplitude export +core-usage/ # MAU/MEU/MDU, funnels, retention KQL +insights-and-segments/ # Usage by language, template, errors KQL +tables/ # Kusto table definitions +ingest/ # Template ingestion scripts +``` + +### azure-dev-tools (Analysis) +``` +product-telemetry/azd/ +├── Kusto/ +│ ├── Functions/ # Deployed Kusto functions (getAzdEvents, etc.) 
+│ ├── KPIs/ # KPI seed queries +│ ├── funnel-metrics/ # Funnel framework + queries +│ ├── Backfill/ # Historical backfill scripts +│ └── Investigations/ # Ad-hoc deep dives +├── PowerBI/ # Power BI report projects +│ ├── KPIs/ +│ ├── User Journeys/ +│ ├── Template Health/ +│ ├── Deploy and Provision/ +│ └── ... +├── Reports/ # Written analyses (markdown, KQLX) +└── PPTs/ # Presentations +``` + +## See Also + +- [Feature Telemetry Guide](../guides/feature-telemetry.md) — How to add telemetry for new features +- [Telemetry Data Reference](../reference/telemetry-data.md) — Schema, events, fields, query patterns +- [Dashboards & Reports](../reference/telemetry-dashboards.md) — Power BI, Kusto functions, funnel metrics +- [Telemetry Overview](../guides/telemetry-overview.md) — For product managers and leadership diff --git a/docs/guides/feature-telemetry.md b/docs/guides/feature-telemetry.md new file mode 100644 index 00000000000..0789760928a --- /dev/null +++ b/docs/guides/feature-telemetry.md @@ -0,0 +1,256 @@ +# Feature Telemetry Guide — Adding Telemetry to New Features + +> End-to-end guide for instrumenting telemetry when building new azd features. +> Ensures telemetry is designed alongside the feature, not bolted on after. +> + + +## Why This Matters + +Telemetry is not a separate system — it's part of the feature. 
When you ship a feature without telemetry: +- Product can't measure adoption or success +- Engineering can't diagnose failures in production +- The GDPR classification pipeline doesn't know about your new data +- Dashboards show gaps that require scrambling to fill later + +This guide walks through the full telemetry lifecycle, connecting three repositories: + +| Step | Repository | What You Do | +|------|-----------|-------------| +| 1–4 | `azure-dev` | Define events, fields, and instrument code | +| 5 | `azd-queries` | Ensure GDPR classification is updated | +| 6 | `azure-dev-tools` | Add queries and dashboard coverage | +| 7 | `azure-dev` (PR) | Get product review on telemetry | + +## Step 1: Define Your Events + +**File:** `cli/azd/internal/tracing/events/events.go` + +Add a constant for your feature's event name(s). Follow the naming conventions: + +| Pattern | When to Use | Example | +|---------|------------|---------| +| `cmd.<command>` | Automatically generated for commands | `cmd.deploy` (via `GetCommandEventName`) | +| `<area>.<operation>` | Non-command operations | `container.publish`, `hooks.exec` | +| `<area>.<operation>.<scope>` | Scoped events | `arm.deploy.subscription` | + +```go +// In events.go — add your event constant +const ( + // MyFeatureEvent tracks the execution of the my-feature operation. + MyFeatureEvent = "myfeature.execute" +) +``` + +> **Note:** Command events (`cmd.*`) are created automatically by the telemetry middleware via +> `events.GetCommandEventName(...)`. You only need to define explicit event constants for +> non-command operations (sub-spans, background work, etc.). + +## Step 2: Define Your Fields + +**File:** `cli/azd/internal/tracing/fields/fields.go` + +Add `AttributeKey` variables for any new properties your feature emits. Every field must have: + +1. **A key name** — descriptive, dot-separated, lowercase +2. **A classification** — what kind of data is this (see [Data Classifications](#data-classifications)) +3. 
**A purpose** — why are we collecting it (see [Purposes](#purposes)) +4. **`IsMeasurement: true`** if the value is numeric (goes to `Measurements` column, not `Properties`) + +```go +// In fields.go — add your field keys +var ( + // The strategy used by my feature. + MyFeatureStrategyKey = AttributeKey{ + Key: attribute.Key("myfeature.strategy"), + Classification: SystemMetadata, + Purpose: FeatureInsight, + } + + // The number of items processed. + MyFeatureItemCountKey = AttributeKey{ + Key: attribute.Key("myfeature.item.count"), + Classification: SystemMetadata, + Purpose: PerformanceAndHealth, + IsMeasurement: true, + } +) +``` + +### Data Classifications + +| Classification | Use When | +|----------------|----------| +| `SystemMetadata` | Non-personal system/config data (most common) | +| `EndUserPseudonymizedInformation` | User identifiers that are hashed (e.g., machine ID) | +| `OrganizationalIdentifiableInformation` | Org identifiers (subscription ID, tenant ID) | +| `PublicPersonalData` | Data the user made public | +| `CallstackOrException` | Stack traces or error details | +| `CustomerContent` | User-created content — highest sensitivity, avoid if possible | + +### Purposes + +| Purpose | Use When | +|---------|----------| +| `FeatureInsight` | Understanding feature adoption and usage patterns | +| `BusinessInsight` | Business metrics (users, orgs, growth) | +| `PerformanceAndHealth` | Performance, errors, reliability | + +### Hashing Sensitive Values + +If your field contains user-generated names or identifiers, **hash it**: + +```go +// In your code, use StringHashed instead of regular attribute setting +tracing.SetUsageAttributes( + fields.StringHashed(fields.MyFeatureNameKey, userProvidedName), +) +``` + +## Step 3: Instrument Your Code + +### For Command Actions + +The telemetry middleware (`cmd/middleware/telemetry.go`) automatically creates a span for every command. 
You just need to add your feature-specific attributes: + +```go +func (a *myAction) Run(ctx context.Context) (*actions.ActionResult, error) { + // Set usage attributes — these get attached to the command span + tracing.SetUsageAttributes( + fields.MyFeatureStrategyKey.String("incremental"), + fields.MyFeatureItemCountKey.Int(len(items)), + ) + + // ... do work ... + + return &actions.ActionResult{...}, nil +} +``` + +### For Sub-Operations (Child Spans) + +If your feature has distinct sub-operations worth tracking separately: + +```go +func (s *myService) ProcessItems(ctx context.Context, items []Item) error { + ctx, span := tracing.Start(ctx, events.MyFeatureEvent) + defer span.End() + + // Set attributes on this span + span.SetAttributes( + fields.MyFeatureItemCountKey.Int(len(items)), + ) + + // ... do work ... + + if err != nil { + span.SetStatus(codes.Error, err.Error()) + return err + } + + return nil +} +``` + +### For Extension Commands + +Extension commands automatically get `ext.run` events. To add structured error reporting: + +```go +// Extensions report errors back to the host +return &azdext.CommandResult{ + Error: &azdext.ServiceError{ + Service: "arm", + StatusCode: resp.StatusCode, + Code: resp.Error.Code, + }, +} +``` + +This maps to result codes like `ext.service.arm.500` in telemetry. + +## Step 4: Update the Telemetry Schema Doc + +**File:** `docs/specs/metrics-audit/telemetry-schema.md` + +Add your new events and fields to the canonical schema reference. This document is the source of truth for what telemetry azd collects and is reviewed during privacy audits. + +## Step 5: GDPR Classification + +The GDPR classify pipeline in `azd-queries` automatically reads event and field definitions from the azure-dev source. After your changes merge: + +1. The scheduled pipeline (`eng/pipelines/classify.yml`) picks up new events/fields +2. It extracts metadata from `events.go` and `fields/` +3. 
It publishes to the GDPR API under product code `ai.devcliapprequests` + +**What you need to do:** + +- Ensure every new field has correct `Classification` and `Purpose` values +- If your field has sensitivity higher than `SystemMetadata`, consult the [Privacy Review Checklist](../specs/metrics-audit/privacy-review-checklist.md) +- If you're adding `CustomerContent` or unhashed PII, a formal privacy review is required before merge + +## Step 6: Add Queries and Dashboard Coverage + +### KQL Queries (azd-queries repo) + +If your feature needs specific monitoring, add KQL queries: + +```kql +// Example: My feature usage by strategy +getAzdEvents(startDate=ago(30d), endDate=now(), true, true) +| where Name == 'myfeature.execute' +| extend Strategy = tostring(Properties['myfeature.strategy']) +| summarize Users = dcount(MachineId), Executions = count() by Strategy +| order by Users desc +``` + +### Kusto Functions (azure-dev-tools repo) + +For reusable analysis, add a Kusto function in `product-telemetry/azd/Kusto/Functions/`: + +1. Create `getMyFeatureEvents.kql` or `calcMyFeatureMetrics.kql` +2. Follow naming conventions: `get*` for retrieval, `calc*` for aggregation, `add*` for enrichment +3. Test in Kusto Explorer +4. Submit PR — the LENS job deploys after merge + +### Power BI Reports + +If your feature warrants dashboard coverage: + +1. Add or update reports in `product-telemetry/azd/PowerBI/` +2. Use the deployed Kusto functions as data sources + +## Step 7: Product Review + +Before merging your feature PR: + +1. **Share the telemetry spec** with your PM — explain what events/fields you're adding and why +2. **Show sample queries** — demonstrate how the data answers product questions +3. **Update the data reference** — add your feature's fields to `docs/reference/telemetry-data.md` + +This ensures product can provide feedback during development, not scramble after launch. 
+ +## Quick Reference: Where Things Live + +| What | Where | File/Path | +|------|-------|-----------| +| Event name constants | azure-dev | `cli/azd/internal/tracing/events/events.go` | +| Field key definitions | azure-dev | `cli/azd/internal/tracing/fields/fields.go` | +| Hashing helpers | azure-dev | `cli/azd/internal/tracing/fields/key.go` | +| Telemetry middleware | azure-dev | `cli/azd/cmd/middleware/telemetry.go` | +| Telemetry pipeline init | azure-dev | `cli/azd/internal/telemetry/telemetry.go` | +| Error classification | azure-dev | `cli/azd/internal/cmd/errors.go` (MapError) | +| Canonical schema | azure-dev | `docs/specs/metrics-audit/telemetry-schema.md` | +| Privacy review checklist | azure-dev | `docs/specs/metrics-audit/privacy-review-checklist.md` | +| GDPR classify pipeline | azd-queries | `eng/pipelines/classify.yml` | +| GDPR tool | azd-queries | `eng/tools/gdpr/` | +| KQL query library | azd-queries | `core-usage/`, `insights-and-segments/` | +| Kusto functions | azure-dev-tools | `product-telemetry/azd/Kusto/Functions/` | +| Power BI reports | azure-dev-tools | `product-telemetry/azd/PowerBI/` | + +## See Also + +- [Architecture](../architecture/telemetry.md) — How the telemetry system works end-to-end +- [Data Reference](../reference/telemetry-data.md) — Complete schema, events, fields, query patterns +- [Dashboards & Reports](../reference/telemetry-dashboards.md) — Analysis layer details +- [Privacy Review Checklist](../specs/metrics-audit/privacy-review-checklist.md) — When to do privacy reviews diff --git a/docs/guides/telemetry-overview.md b/docs/guides/telemetry-overview.md new file mode 100644 index 00000000000..8849b24e95e --- /dev/null +++ b/docs/guides/telemetry-overview.md @@ -0,0 +1,113 @@ +# azd Telemetry — Product Overview + +> What azd telemetry tells us, where to find it, and how to work with it. 
+> + + +## What azd Telemetry Captures + +Azure Developer CLI (azd) collects anonymous usage telemetry to understand how developers use the tool, measure feature adoption, and diagnose issues at scale. Users can opt out at any time. + +### What We Collect + +| Category | Examples | +|----------|---------| +| **Commands** | Which commands are run (`init`, `deploy`, `provision`, `up`), success/failure, duration | +| **Features** | Feature-specific properties (template used, packaging format, auth method, target Azure services) | +| **Errors** | Error codes and categories (ARM errors, auth failures, build failures), **not** user content | +| **Environment** | OS, azd version, execution environment (GitHub Actions, Azure Pipelines, VS Code, etc.) | +| **Extensions** | Which extensions are installed and invoked, extension errors | +| **Performance** | Operation duration, time spent waiting for user input vs. executing | + +### What We Don't Collect + +- No source code or project file contents +- No Azure credentials, tokens, or connection strings +- No personal names, emails, or IP addresses +- Project names and template names are **hashed** (one-way) — we can count unique projects but can't see what they're called +- Users opt out via `azd config set defaults.collectTelemetry no` or `AZURE_DEV_COLLECT_TELEMETRY=no` + +## Key Metrics + +| Metric | What It Measures | Where to Find It | +|--------|-----------------|------------------| +| **MAU** (Monthly Active Users) | Unique users per month (by hashed machine ID) | KPIs dashboard | +| **MEU** (Monthly Engaged Users) | Users who run engagement commands (provision, deploy, up) | KPIs dashboard | +| **MDU** (Monthly Dev Users) | Users in local dev environments (not CI/CD) | KPIs dashboard | +| **Success Rate** | % of command executions that succeed | KPIs dashboard, per-command | +| **Error Rate by Category** | Top error categories (ARM, auth, build, network) | Template Health dashboard | +| **Template Adoption** | Which 
templates are used, by how many users | Template KPIs dashboard | +| **Funnel Completion** | % of users completing init → provision → deploy | User Journeys dashboard | +| **Retention** | Users returning week-over-week / month-over-month | KPIs dashboard | +| **New Users** | First-time users per time period | KPIs dashboard | +| **Provision/Deploy Duration** | P50/P90 operation time by template | Deploy and Provision dashboard | + +## Where to Find Dashboards + +| Dashboard | Link | What It Shows | +|-----------|------|---------------| +| **Main Dashboard** | [aka.ms/azd/dashboard](https://aka.ms/azd/dashboard) | Primary Power BI report with KPIs, template health, user journeys | +| **Dashboard Collection** | [aka.ms/azd-dashboards](https://aka.ms/azd-dashboards) | All azd-related dashboards | + +### Dashboard Areas + +| Area | What You'll Find | +|------|-----------------| +| **KPIs** | MAU/MEU/MDU trends, success rates, new users | +| **Template KPIs** | Per-template adoption, success, performance | +| **Template Health** | Error rates, failure patterns, top issues per template | +| **Deploy and Provision** | Operation analysis: durations, errors, Azure services used | +| **User Journeys** | Workflow funnels (init → provision → deploy) | +| **Customer Exploration** | Customer-specific usage exploration | +| **AI Foundry** | AI Foundry template metrics | +| **MCP Tools** | Model Context Protocol tool usage | + +## How to Request New Telemetry + +### For a New Feature + +Work with the feature engineer during development: + +1. **During design** — Discuss what questions you want to answer about the feature +2. **During implementation** — Engineer instruments telemetry following the [Feature Telemetry Guide](feature-telemetry.md) +3. **During PR review** — Review the telemetry fields to ensure they answer your product questions +4. **After launch** — Verify data is flowing and dashboards are updated + +### For Additional Metrics on an Existing Feature + +1. 
File an issue describing: + - What question you want to answer + - What data you think is needed + - Which feature/commands it relates to +2. Engineering evaluates whether: + - The data already exists and just needs a query/dashboard + - New instrumentation is required (code change) + - New Kusto functions or reports are needed + +### Who to Contact + +The telemetry pipeline spans multiple layers. Depending on what you need: + +| Need | Who Can Help | +|------|-------------| +| New code instrumentation | Feature engineer + telemetry reviewer | +| New KQL queries or Kusto functions | PM + Feature Engineer | +| New/updated Power BI reports | PM | +| GDPR/privacy review | Automatic via pipeline; manual review only for `CustomerContent` or unhashed PII classifications | + +## Privacy and Compliance + +- All telemetry is anonymous — no PII is collected +- Identifiers (machine ID, project name, etc.) are one-way hashed +- Users can opt out at any time +- GDPR classification pipeline automatically processes all telemetry fields +- Data retention follows Microsoft standard telemetry retention policies + +## Further Reading + +| Document | What It Covers | Audience | +|----------|---------------|----------| +| [Architecture](../architecture/telemetry.md) | End-to-end system architecture with diagrams | Engineering, Product | +| [Data Reference](../reference/telemetry-data.md) | Complete schema, all events and fields, KQL examples | Engineering, Product | +| [Feature Telemetry Guide](feature-telemetry.md) | How to add telemetry to new features | Engineering | +| [Dashboards & Reports](../reference/telemetry-dashboards.md) | Power BI reports, Kusto functions, analysis tools | Engineering, Product | diff --git a/docs/reference/telemetry-dashboards.md b/docs/reference/telemetry-dashboards.md new file mode 100644 index 00000000000..fdeb3bd53ad --- /dev/null +++ b/docs/reference/telemetry-dashboards.md @@ -0,0 +1,196 @@ +# Dashboards & Reports — azd Product Analysis Layer + +> 
Reference for the Power BI reports, Kusto functions, and analysis tools built on azd telemetry. +> + + +## Overview + +The analysis layer sits on top of raw telemetry data and provides dashboards, KPIs, and investigation tools. It lives in the [`azure-dev-tools`](https://github.com/coreai-microsoft/azure-dev-tools) repository under `product-telemetry/azd/`. + +🔗 **Dashboard:** [aka.ms/azd/dashboard](https://aka.ms/azd/dashboard) + +## How It Connects + +```mermaid +flowchart LR + subgraph Raw ["Raw Telemetry (Kusto)"] + RT["RawEventsAppRequests"] + TT["Templates"] + end + + subgraph Functions ["Kusto Functions (deployed)"] + Get["getAzdEvents
getAzdArmEvents
getAzdInstallers"] + Add["addTemplateColumns
addCustomerColumns
addExecutionTimeColumns"] + Calc["calcAzdOperations
calcFirstSuccessful
calcNeverBeforeSeen"] + end + + subgraph Analysis ["Analysis Artifacts"] + PBI["Power BI Reports"] + KPI["KPI Tables
(AzdKPIs)"] + Inv["Investigations"] + end + + RT --> Get --> Add --> Calc + TT --> Add + Calc --> PBI + Calc --> KPI + RT --> Inv +``` + +## Kusto Functions + +Deployed to **`DDAzureClients.DevCli`** under the `azd` folder via a [LENS job](https://lens.msftcloudes.com/#/job/24ce3f0fd3d6499ab8a0d85d0c0c05e2). + +**Location:** `product-telemetry/azd/Kusto/Functions/` + +### Naming Conventions + +| Prefix | Purpose | Examples | +|--------|---------|----------| +| `get*` | Retrieves raw or filtered data | `getAzdEvents`, `getAzdArmEvents`, `getAzdInstallers` | +| `add*` | Enriches data with additional columns | `addTemplateColumns`, `addCustomerColumns`, `addExecutionTimeColumns` | +| `calc*` | Calculates metrics, aggregations, or KPIs | `calcAzdOperations`, `calcFirstSuccessfulExecution` | +| `flag*` | Adds boolean flags for filtering | `flagTestAzSubs` | + +### Core Functions + +#### Data Retrieval + +| Function | Description | Key Parameters | +|----------|-------------|----------------| +| `getAzdEvents(...)` | Base query for all azd events. Filters `RawEventsAppRequests` by date range, local clients, daily builds, and minimum version. 
| `startDate`, `endDate`, `excludeLocalClients`, `excludeDailyBuilds`, `minVersion` | +| `getAzdArmEvents(...)` | ARM deployment-specific events | Same as `getAzdEvents` | +| `getAzdArmPreflightErrors(...)` | ARM preflight validation errors | Date range | +| `getAzdInstallers(...)` | Installation method data | Date range | +| `getAzdTemplatesByLanguage(...)` | Template usage by language | Date range | +| `getAzdCustomerDetails(...)` | Customer metadata enrichment | Date range | +| `getAzdCcids(...)` | Customer CIDs | Date range | +| `getAzdAzSubDetails(...)` | Azure subscription details | Date range | +| `getAzdMcpToolCalls_VSCode(...)` | MCP tool calls from VS Code | Date range | + +#### Data Enrichment (`add*`) + +| Function | What It Adds | Usage | +|----------|-------------|-------| +| `addTemplateColumns` | `TemplateName`, `TemplateRepo` — resolves hashed template IDs to names via `Templates` table | `\| invoke addTemplateColumns()` | +| `addTemplateName` | Just the template name (lighter weight) | `\| invoke addTemplateName()` | +| `addCustomerColumns` | Customer/organization details | `\| invoke addCustomerColumns()` | +| `addCustomerTpid` | Customer TPID (top-parent ID) | `\| invoke addCustomerTpid()` | +| `addCustomerAccountManager` | Account manager info | `\| invoke addCustomerAccountManager()` | +| `addAzSubColumns` | Azure subscription metadata | `\| invoke addAzSubColumns()` | +| `addExecutionTimeColumns` | `ExecutionTimeMs = DurationMs - perf.interact_time` | `\| invoke addExecutionTimeColumns()` | +| `addAzdAndArmErrorDetails` | Enriched error categorization for ARM errors | `\| invoke addAzdAndArmErrorDetails()` | + +#### Calculations (`calc*`) + +| Function | What It Calculates | +|----------|-------------------| +| `calcAzdOperations(...)` | Operation-level metrics (success rates, durations) | +| `calcAzdProvisionDurationByTemplate(...)` | Provision duration percentiles per template | +| `calcAzdDeploymentDurationByTemplate(...)` | Deployment 
duration percentiles per template | +| `calcAzdProvisionErrorsByTemplate(...)` | Top provision errors per template | +| `calcAzdDeploymentErrorsByTemplate(...)` | Top deployment errors per template | +| `calcAzdOperationDurationByTemplate(...)` | Overall operation duration per template | +| `calcAzdProvisionsByAzService(...)` | Provision counts by Azure service type | +| `calcAzdProvisionsByAzServiceAndTemplate(...)` | Provisions by service and template | +| `calcAzdProvisionsByAzService_FoundryAndAi(...)` | AI/Foundry-specific provision analysis | +| `calcDailyAzdProvisionsByTemplate(...)` | Daily provision counts per template | +| `calcDailyAzdDeploymentsByTemplate(...)` | Daily deployment counts per template | +| `calcFirstSuccessfulExecution(...)` | First successful execution per user/template | +| `calcFirstSuccessfulProvisionByServiceAndTemplate(...)` | First successful provision by service and template | +| `calcNeverBeforeSeenUsersForAzd(...)` | New users (never seen before) | +| `calcNeverBeforeSeenAzdDevDeviceIds(...)` | New devices | +| `calcNeverBeforeSeenAzdAzSubs(...)` | New Azure subscriptions | +| `calcNeverBeforeSeenAzdTemplates(...)` | New templates | + +#### Utility + +| Function | Purpose | +|----------|---------| +| `flagTestAzSubs` | Flags known test/internal Azure subscriptions | +| `getAllAzdTemplateNames` | Lists all known template names | +| `getAllAzServiceProvidersAndTypes` | Lists all Azure service providers and resource types | + +### KPI Subfunctions + +Additional functions under `Kusto/Functions/WeeklyKPIs/` and `Kusto/Functions/MonthlyKPIs/` compute periodic KPI rollups. + +### Adding a New Function + +1. Create a `.kql` file in `product-telemetry/azd/Kusto/Functions/` +2. Follow the naming convention (`get*`, `add*`, `calc*`, `flag*`) +3. Include a comment header explaining the function +4. Test in [Kusto Explorer](https://dataexplorer.azure.com/clusters/ddazureclients/databases/DevCli) +5. 
Submit a PR — the LENS job deploys after merge + +## Power BI Reports + +**Location:** `product-telemetry/azd/PowerBI/` + +Reports are organized by topic area using PBIP (Power BI Project) format. + +| Report Area | What It Covers | +|------------|----------------| +| **About** | Overview and documentation about the report suite | +| **KPIs** | Core KPI dashboards (MAU, MEU, MDU, success rates) | +| **Template KPIs** | Per-template adoption, success, and performance metrics | +| **Template Health** | Template error rates, failure patterns, top issues | +| **Deploy and Provision** | Deployment and provisioning operation analysis | +| **User Journeys** | User workflow patterns (init → provision → deploy funnels) | +| **Customer Exploration** | Customer-specific usage pattern exploration | +| **AI Foundry** | AI Foundry template usage and metrics | +| **Azure.AI.Agents** | Azure AI Agents telemetry | +| **MCP Tools** | Model Context Protocol tool usage | +| **On Demand Explorations** | Ad-hoc exploration reports | + +### Adding or Updating Reports + +1. Add your `.pbip` file to the appropriate subfolder in `PowerBI/` +2. For new categories, create a new folder with a descriptive name +3. Use deployed Kusto functions as data sources for maintainability +4. Add a README in the folder explaining the report's purpose + +## KQL Query Library (azd-queries repo) + +The [`azd-queries`](https://github.com/devdiv-azure-service-dmitryr/azd-queries) repo contains standalone KQL queries used for dashboards and analysis. These are separate from the deployed Kusto functions. 
+ +| Folder | What It Contains | +|--------|-----------------| +| `core-usage/` | MAU/MEU/MDU, funnels, retention, first-success, subscriptions | +| `insights-and-segments/` | Usage by language, template, installer, client, tenure, quota errors | +| `aspire/` | .NET Aspire-specific telemetry | +| `vscode/` | VS Code extension telemetry queries | +| `paas-retention/` | PaaS retention analysis | +| `cu-analysis/` | Consumption unit analysis | + +## Investigations + +**Location:** `product-telemetry/azd/Kusto/Investigations/` + +Ad-hoc investigation queries for specific debugging scenarios or customer explorations. These are not deployed as functions but serve as reference for common investigation patterns. + +## Reports + +**Location:** `product-telemetry/azd/Reports/` + +Written analysis documents: + +| Report | Description | +|--------|-------------| +| `unknown-errors-report.md` | Analysis of unclassified errors in azd telemetry | +| `provision-errors-report.md` | Deep-dive into provision failure patterns | +| `azd-state-of-the-product-feb-2026.kqlx` | State of the product analysis | + +## Funnel Metrics Framework + +**Location:** `product-telemetry/azd/Kusto/funnel-metrics/` + +A framework for defining and computing user funnels (e.g., init → provision → deploy). See `funnel-metrics/README.md` for the model and data sources. 
+ +## See Also + +- [Architecture](../architecture/telemetry.md) — How telemetry flows end-to-end +- [Data Reference](telemetry-data.md) — Schema, events, fields, query patterns +- [Feature Telemetry Guide](../guides/feature-telemetry.md) — Adding telemetry to new features +- [Telemetry Overview](../guides/telemetry-overview.md) — For product managers and leadership diff --git a/docs/reference/telemetry-data.md b/docs/reference/telemetry-data.md new file mode 100644 index 00000000000..57607331454 --- /dev/null +++ b/docs/reference/telemetry-data.md @@ -0,0 +1,595 @@ +# Telemetry Data Reference — Understanding & Querying azd Telemetry + +> Schema reference for all azd telemetry events, fields, and Kusto tables. +> Use this to understand what data exists and how to query it. +> + + +## Kusto Tables + +All azd telemetry lands in Azure Data Explorer (Kusto): + +- **Cluster:** `DDAzureClients` +- **Database:** `DevCli` +- **Primary table:** `RawEventsAppRequests` +- **Supplementary tables:** `Templates`, `TemplateVersions`, `AzdKPIs` + +### RawEventsAppRequests — Core Columns + +| Column | Type | Description | +|--------|------|-------------| +| `TimeGenerated` | datetime | When the event was recorded | +| `Name` | string | Event/span name (e.g., `cmd.deploy`, `ext.run`) | +| `DurationMs` | real | Total span duration in milliseconds | +| `Success` | bool | Whether the operation succeeded | +| `ResultCode` | string | Error classification code (e.g., `Success`, `service.arm.500`, `internal.unclassified`) | +| `OperationId` | string | Unique ID for the top-level command invocation | +| `Properties` | dynamic | String/bool span attributes (JSON bag) | +| `Measurements` | dynamic | Numeric span attributes (JSON bag) | +| `AppVersion` | string | azd CLI version | + +### Accessing Properties and Measurements + +```kql +// String properties +| extend TemplateId = tostring(Properties['project.template.id']) + +// Numeric measurements +| extend InteractTimeMs = 
toreal(Measurements['perf.interact_time']) + +// Computed execution time (excludes user interaction) +| extend ExecutionTimeMs = DurationMs - toreal(Measurements['perf.interact_time']) +``` + +## Events Reference + +Events are defined in `cli/azd/internal/tracing/events/events.go`. Each event becomes a span `Name` in Kusto. + +### Core Command Events (`cmd.*`) + +Commands follow the pattern `cmd.<command>`, where spaces in the command path become dots (e.g., `azd auth login` → `cmd.auth.login`). + +| Event Pattern | Example | Description | +|--------------|---------|-------------| +| `cmd.<command>` | `cmd.init`, `cmd.up`, `cmd.deploy` | Top-level command execution | +| `cmd.<command>.<subcommand>` | `cmd.auth.login`, `cmd.env.new` | Subcommand execution | +| `cmd.<command>.<subcommand>.<subcommand>` | `cmd.pipeline.config` | Deeper subcommands | + +**Common command events:** +- `cmd.init` — project initialization +- `cmd.up` — full provision + deploy cycle +- `cmd.provision` — infrastructure provisioning +- `cmd.deploy` — application deployment +- `cmd.package` — application packaging +- `cmd.down` — resource teardown +- `cmd.auth.login` — authentication +- `cmd.env.new` / `cmd.env.select` — environment management +- `cmd.pipeline.config` — CI/CD pipeline setup +- `cmd.monitor` — monitoring +- `cmd.restore` — dependency restoration + +### Extension Events (`ext.*`) + +| Event | Description | +|-------|-------------| +| `ext.run` | Extension command execution | +| `ext.install` | Extension installation | +| `ext.upgrade` | Extension upgrade attempt | +| `ext.promote` | Registry promotion (e.g., dev → main) | + +### Agent & Copilot Events + +| Event | Description | +|-------|-------------| +| `agent.troubleshoot` | Agent troubleshooting session | +| `copilot.initialize` | Copilot agent initialization | +| `copilot.session` | Copilot session creation/resumption | + +### MCP Events (`mcp.*`) + +| Event Pattern | Description | +|--------------|-------------| +| `mcp.<tool>` | MCP tool invocation | + +### Infrastructure Events (`arm.*`) + +| Event | Description | +|-------|-------------| +|
`arm.deploy.subscription` | ARM deployment at subscription scope | +| `arm.deploy.resourcegroup` | ARM deployment at resource group scope | +| `arm.stack.deploy.subscription` | ARM deployment stack at subscription scope | +| `arm.stack.deploy.resourcegroup` | ARM deployment stack at resource group scope | +| `arm.whatif.subscription` | ARM what-if at subscription scope | +| `arm.whatif.resourcegroup` | ARM what-if at resource group scope | +| `arm.validate.subscription` | ARM validation at subscription scope | +| `arm.validate.resourcegroup` | ARM validation at resource group scope | + +### Other Events + +| Event | Description | +|-------|-------------| +| `tools.pack.build` | Cloud Native Buildpacks build | +| `validation.preflight` | Local preflight validation | +| `hooks.exec` | Lifecycle hook execution | +| `aks.postprovision.skip` | AKS postprovision hook skipped | +| `deploy.appservice.zip` | App Service zip deployment | +| `container.credentials` | Container registry credential retrieval | +| `container.publish` | Container image publish | +| `container.remotebuild` | Remote container build | +| `exegraph.run` | Execution graph run (parallel operations) | +| `exegraph.step` | Single step within execution graph | + +### VS Code Extension Events (`azure-dev.*`) + +These are emitted by the VS Code extension via the VS Code telemetry framework (separate from CLI telemetry). + +| Event | Description | +|-------|-------------| +| `azure-dev.activate` | Extension activated | +| `azure-dev.deactivate` | Extension deactivated | +| `azure-dev.tasks.dotenv` | Dotenv task executed | +| `azure-dev.commands.<command>` | CLI command tasks (deploy, provision, up, down, init, login, restore, package) | +| `azure-dev.survey-check` | Survey eligibility check | +| `azure-dev.survey-prompt-response` | Survey prompt user response | + +### VS RPC Events (`vsrpc.*`) + +JSON-RPC events for VS Code ↔ azd communication. Follow the pattern `vsrpc.<method>`.
+ +## Fields Reference + +Fields appear as `Properties` (strings/bools) or `Measurements` (numbers) in the Kusto table. + +### Application-Level Fields (Every Event) + +These are set once at process startup and attached to **every** span. + +| Field Key | Type | Description | Example Values | +|-----------|------|-------------|----------------| +| `service.name` | string | Always `"azd"` | `azd` | +| `service.version` | string | CLI version | `1.23.5` | +| `os.type` | string | Operating system | `linux`, `windows`, `darwin` | +| `os.version` | string | OS version | `10.0.22621`, `14.5` | +| `host.arch` | string | CPU architecture | `amd64`, `arm64` | +| `process.runtime.version` | string | Go runtime version | `go1.26.0` | +| `machine.id` | string | MAC address hash (pseudonymized) | SHA-256 hash | +| `machine.devdeviceid` | string | SQM device ID | UUID string | +| `execution.environment` | string | Where azd is running | See [Execution Environments](#execution-environments) | +| `service.installer` | string | How azd was installed | `msi`, `brew`, `choco`, `rpm`, `deb` | + +### Identity & Account Fields + +| Field Key | Type | Description | +|-----------|------|-------------| +| `user_AuthenticatedId` | string | Entra ID Object ID | +| `ad.tenant.id` | string | Entra ID Tenant ID | +| `ad.account.type` | string | `User` or `Service Principal` | +| `ad.subscription.id` | string | Azure Subscription ID | + +### Project Context Fields + +| Field Key | Type | Hashed? 
| Description | +|-----------|------|---------|-------------| +| `project.template.id` | string | ✅ SHA-256 | Template identifier from `azure.yaml` metadata | +| `project.template.version` | string | ✅ SHA-256 | Template version | +| `project.name` | string | ✅ SHA-256 | Project name | +| `project.service.hosts` | string[] | ❌ | Host types (e.g., `appservice`, `containerapp`) | +| `project.service.targets` | string[] | ❌ | Resolved deployment targets | +| `project.service.languages` | string[] | ❌ | Languages across all services | +| `project.service.language` | string | ❌ | Language of specific service being executed | +| `platform.type` | string | ❌ | Platform integration (e.g., `aca`, `aks`) | +| `env.name` | string | ✅ SHA-256 | Environment name | + +> **Joining with template names:** Template IDs are hashed. To resolve to human-readable names, +> join with the `Templates` table using `project.template.id` = `Templates.Hash`. +> The `addTemplateColumns` Kusto function does this automatically. + +### Command Entry-Point Fields + +| Field Key | Type | Description | +|-----------|------|-------------| +| `cmd.flags` | string[] | Flag names that were set (values not recorded) | +| `cmd.args.count` | measurement | Number of positional arguments | +| `cmd.entry` | string | How the command was invoked (formatted as event name) | + +### Error Fields + +| Field Key | Type | Description | +|-----------|------|-------------| +| `error.category` | string | High-level error category | +| `error.code` | string | Specific error code | +| `error.type` | string | Same as `ResultCode` — the classified error type | +| `error.chain.types` | string[] | Full Go error type chain, outermost first | + +#### Error Classification (ResultCode Taxonomy) + +The `ResultCode` field classifies errors into categories. Understanding this taxonomy is essential for querying failures. 
+ +| Pattern | Category | Example | +|---------|----------|---------| +| `Success` | No error | — | +| `user.canceled` | User canceled the operation | — | +| `service.arm.<code>` | ARM service error | `service.arm.500`, `service.arm.409` | +| `service.aad.<code>` | Entra ID (AAD) error | `service.aad.failed` | +| `service.<name>.<code>` | Other Azure service error | `service.graph.403` | +| `tool.<name>.<code>` | External tool error | `tool.docker.1` | +| `ext.service.<name>.<code>` | Extension service error | `ext.service.arm.500` | +| `ext.validation.*` | Extension validation error | `ext.validation.config` | +| `ext.auth.*` | Extension auth error | `ext.auth.expired` | +| `ext.dependency.*` | Extension dependency error | `ext.dependency.missing` | +| `internal.unclassified` | Catch-all for unclassified errors | — | +| `internal.errors_errorString` | Legacy catch-all (being replaced by `internal.unclassified`) | — | + +> **⚠️ Known gap:** Many errors historically fall into `internal.errors_errorString` / `internal.unclassified` +> because the error classifier only inspects the leaf error type. Work to improve this is tracked in +> [azure-dev#8011](https://github.com/Azure/azure-dev/issues/8011) (error chain + classifier + origin context).
+ +### Service Attributes (Azure API Calls) + +| Field Key | Type | Description | +|-----------|------|-------------| +| `service.host` | string | Azure service host | +| `service.name` | string | Azure service name | +| `service.statusCode` | measurement | HTTP status code | +| `service.method` | string | HTTP method | +| `service.errorCode` | measurement | Service-specific error code | +| `service.correlationId` | string | Azure correlation ID | + +### Performance Fields + +| Field Key | Type | Description | +|-----------|------|-------------| +| `perf.interact_time` | measurement | Time (ms) spent waiting for user input | + +> **Computing execution time:** `ExecutionTimeMs = DurationMs - Measurements['perf.interact_time']` +> This gives you the actual processing time, excluding user interaction (prompts, confirmations). + +### Feature-Specific Fields + +
<details> +<summary>Authentication</summary> + +| Field Key | Type | Values | +|-----------|------|--------| +| `auth.method` | string | `browser`, `device-code`, `service-principal-secret`, `service-principal-certificate`, `federated-github`, `federated-azure-pipelines`, `federated-oidc`, `managed-identity`, `external`, `oneauth`, `check-status` | +</details>
+ +
<details> +<summary>Init / App Init</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `init.method` | string | `template`, `app`, `project`, `environment`, `copilot` | +| `appinit.detected.databases` | string[] | Databases detected during init | +| `appinit.detected.services` | string[] | Services detected during init | +| `appinit.confirmed.databases` | string[] | Databases confirmed by user | +| `appinit.confirmed.services` | string[] | Services confirmed by user | +| `appinit.modify_add.count` | measurement | Services added during modification | +| `appinit.modify_remove.count` | measurement | Services removed during modification | +| `appinit.lastStep` | string | Last init step reached | +</details>
+ +
<details> +<summary>Hooks</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `hooks.name` | string | Hook name (e.g., `preprovision`, `postdeploy`). Custom hooks are SHA-256 hashed. | +| `hooks.type` | string | Scope: `project`, `service`, or `layer` | +| `hooks.kind` | string | Executor: `sh`, `pwsh`, `python`, `js`, `ts`, `dotnet` | +</details>
+ +
<details> +<summary>Pipeline Config</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `pipeline.provider` | string | `github`, `azdo`, `auto` | +| `pipeline.auth` | string | `federated`, `client-credentials`, `auto` | +</details>
+ +
<details> +<summary>Infrastructure</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `infra.provider` | string | `bicep`, `terraform`, `auto` | +</details>
+ +
<details> +<summary>Deployment</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `deploy.appservice.attempt` | measurement | Retry attempt number for App Service zip deploy | +| `deploy.appservice.linux` | string | Whether deploying to Linux App Service | +</details>
+ +
<details> +<summary>Preflight Validation</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `validation.preflight.outcome` | string | `passed`, `warnings_accepted`, `aborted_by_errors`, `aborted_by_user`, `skipped`, `error` | +| `validation.preflight.diagnostics` | string[] | Diagnostic IDs emitted | +| `validation.preflight.rules` | string[] | Rule IDs executed | +| `validation.preflight.warning.count` | measurement | Number of warnings | +| `validation.preflight.error.count` | measurement | Number of errors | +</details>
+ +
<details> +<summary>Provision Cancellation</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `provision.cancellation` | string | `none`, `leave_running`, `canceled`, `cancel_timed_out`, `cancel_timed_out_nested`, `cancel_raced_succeeded`, `cancel_raced_failed`, `cancel_raced_deleted`, `cancel_too_late`, `cancel_failed` | +</details>
+ +
<details> +<summary>Copilot</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `copilot.session.id` | string | Session identifier | +| `copilot.session.isNew` | string | Whether this is a new session | +| `copilot.session.messageCount` | measurement | Messages in session | +| `copilot.init.isFirstRun` | string | First copilot run | +| `copilot.init.reasoningEffort` | string | Reasoning effort level | +| `copilot.init.model` | string | Model used | +| `copilot.init.consentScope` | string | Consent scope | +| `copilot.mode` | string | Copilot mode | +| `copilot.message.model` | string | Model for specific message | +| `copilot.message.inputTokens` | measurement | Input token count | +| `copilot.message.outputTokens` | measurement | Output token count | +| `copilot.message.billingRate` | measurement | Billing rate | +| `copilot.message.premiumRequests` | measurement | Premium request count | +| `copilot.message.durationMs` | measurement | Message duration | +| `copilot.consent.approvedCount` | measurement | Approved consent actions | +| `copilot.consent.deniedCount` | measurement | Denied consent actions | +</details>
+ +
<details> +<summary>Extensions</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `extension.id` | string | Extension identifier | +| `extension.version` | string | Extension version | +| `extension.installed` | string[] | List of installed extensions (`id@version`) | +</details>
+ +
<details> +<summary>MCP</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `mcp.client.name` | string | MCP client name | +| `mcp.client.version` | string | MCP client version | +</details>
+ +
<details> +<summary>Execution Graph</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `exegraph.step.count` | measurement | Total steps in graph | +| `exegraph.max_concurrency` | string | Effective concurrency limit | +| `exegraph.error_policy` | string | `fail_fast` or `continue_on_error` | +| `exegraph.step.name` | string | Step name | +| `exegraph.step.deps` | string[] | Step dependencies | +| `exegraph.step.tags` | string[] | Step tags | +</details>
+ +
<details> +<summary>Pack (Buildpacks)</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `pack.builder.image` | string | Builder image name | +| `pack.builder.tag` | string | Builder image tag | +</details>
+ +
<details> +<summary>Update</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `update.channel` | string | Update channel | +| `update.installMethod` | string | Installation method | +| `update.fromVersion` | string | Version before update | +| `update.toVersion` | string | Version after update | +| `update.result` | string | Update outcome | +</details>
+ +
<details> +<summary>JSON-RPC</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `rpc.method` | string | RPC method name | +| `rpc.jsonrpc.request_id` | string | Request ID | +| `rpc.jsonrpc.error_code` | measurement | Error code | +</details>
+ +
<details> +<summary>Agent</summary> + +| Field Key | Type | Description | +|-----------|------|-------------| +| `agent.fix.attempts` | string | Number of fix attempts | +</details>
+ +### Execution Environments + +The `execution.environment` field identifies where azd is running. Format: `<environment>[;<modifier>;<modifier>...]` + +| Value | Description | +|-------|-------------| +| `Desktop` | Direct terminal usage | +| `Visual Studio` | VS integration | +| `Visual Studio Code` | VS Code integration | +| `VS Code Azure GitHub Copilot` | Azure Copilot in VS Code | +| `Azure CloudShell` | Azure Cloud Shell | +| `Claude Code` | Claude Code AI agent | +| `GitHub Copilot CLI` | GitHub Copilot CLI | +| `Gemini` | Gemini AI agent | +| `OpenCode` | OpenCode AI agent | +| `GitHub Actions` | GitHub Actions CI | +| `Azure Pipelines` | Azure Pipelines CI | +| `GitHub Codespaces` | GitHub Codespaces | +| Other CI systems | `AppVeyor`, `Bamboo`, `BitBucket Pipelines`, `Travis CI`, `Circle CI`, `GitLab CI`, `Jenkins`, `AWS CodeBuild`, `Google Cloud Build`, `TeamCity`, `JetBrains Space` | + +**Modifier:** `Azure App Spaces Portal` may be appended as a modifier (`;` separated). + +## Data Nuances & Gotchas + +Important things to know when querying azd telemetry data. These are sourced from real investigations and issues. + +### OperationId Reuse in Retry/Troubleshoot Flows + +**Issue:** [azure-dev-pr#1771](https://github.com/Azure/azure-dev-pr/issues/1771) + +When `cmd.up` triggers `agent.troubleshoot` after a failure, the troubleshoot agent may retry the failed operation (e.g., `cmd.deploy`). These retries share the **same OperationId** as the parent `cmd.up` span. + +This means you may see multiple rows with the same `OperationId` and `Name` (e.g., two `cmd.deploy` rows). These are **not duplicate events** — they are retry attempts within a single user session.
+ +**Example pattern:** +``` +OperationId: 28ce1f2898a4fec84522107e36c22038 + cmd.up (511s, FAIL) + ├── cmd.package ✅ + ├── cmd.provision ✅ + ├── cmd.deploy ❌ (service.arm.500) ← attempt 1 + ├── agent.troubleshoot ✅ (471s) + │ ├── cmd.mcp.start + │ ├── cmd.package ✅ → cmd.provision ✅ ← retry + ├── cmd.deploy ❌ (service.aad.failed) ← attempt 2 + └── cmd.deploy ❌ (service.aad.failed) ← attempt 3 +``` + +**Impact on queries:** +```kql +// ❌ WRONG — counts retries as separate users/invocations +getAzdEvents(...) | where Name == 'cmd.deploy' | summarize count() + +// ✅ CORRECT — count distinct OperationIds to get unique invocations +getAzdEvents(...) | where Name == 'cmd.deploy' | summarize dcount(OperationId) + +// ✅ Or be explicit about only first attempts +getAzdEvents(...) +| where Name == 'cmd.deploy' +| summarize arg_min(TimeGenerated, *) by OperationId +``` + +### The `internal.unclassified` / `internal.errors_errorString` Catch-All + +Many failed commands produce the catch-all result code `internal.errors_errorString` (being renamed to `internal.unclassified`). This happens because the error classifier inspects only the leaf error type, and `errors.New()` / `fmt.Errorf()` without `%w` produce `*errors.errorString`, which has no domain meaning. + +**To investigate these errors:** +1. Check `error.chain.types` (if available, added in [#8011](https://github.com/Azure/azure-dev/issues/8011)) for the full error type chain +2. Correlate with `service.errorCode` or `service.statusCode` for Azure API failures +3. Look at surrounding span context (same `OperationId`) for additional detail + +### Hashed Fields and Template Joins + +Fields like `project.template.id`, `project.name`, `env.name` are **SHA-256 hashed** before emission to protect privacy. You cannot reverse them. + +To resolve template IDs to human-readable names, use the `Templates` table: +```kql +getAzdEvents(...) 
+| invoke addTemplateColumns() +| project TimeGenerated, TemplateName, Success +``` + +### Execution Time vs Duration + +`DurationMs` includes time the user spent at prompts (confirmations, selections). To measure only execution time, subtract the interaction time: +```kql +| extend ExecutionTimeMs = DurationMs - toreal(Measurements['perf.interact_time']) +``` + +### Internal vs External Users + +To distinguish Microsoft internal users from external: +```kql +// The addCustomerColumns function enriches with customer details +getAzdEvents(...) | invoke addCustomerColumns() + +// Or flag known test/internal subscriptions so they can be filtered out +getAzdEvents(...) | invoke flagTestAzSubs() +``` + +## Common Query Patterns + +### Basic: Usage by Command (Last 30 Days) +```kql +getAzdEvents(startDate=ago(30d), endDate=now(), true, true) +| where Name startswith "cmd." +| summarize Users = dcount(MachineId), Executions = count() by Name +| order by Users desc +``` + +### Feature Adoption: Template Usage +```kql +getAzdEvents(startDate=ago(30d), endDate=now(), true, true) +| where Name == 'cmd.up' and Success +| invoke addTemplateColumns() +| summarize Users = dcount(MachineId) by TemplateName +| order by Users desc +``` + +### Error Analysis: Top Failure Reasons +```kql +getAzdEvents(startDate=ago(7d), endDate=now(), true, true) +| where Name == 'cmd.deploy' and not(Success) +| summarize Count = count() by ResultCode +| order by Count desc +``` + +### Performance: Command Duration (p50/p95) +```kql +getAzdEvents(startDate=ago(30d), endDate=now(), true, true) +| where Name == 'cmd.provision' and Success +| extend ExecutionTimeMs = DurationMs - toreal(Measurements['perf.interact_time']) +| summarize p50 = percentile(ExecutionTimeMs, 50), p95 = percentile(ExecutionTimeMs, 95) by bin(TimeGenerated, 1d) +``` + +### Funnel: Init → Provision → Deploy Success +```kql +let timeRange = ago(30d); +let events = getAzdEvents(startDate=timeRange, endDate=now(), true, true); +let initUsers = events | where Name == 'cmd.init' | summarize by MachineId; +let provisionUsers = 
events | where Name == 'cmd.provision' and Success | summarize by MachineId; +let deployUsers = events | where Name == 'cmd.deploy' and Success | summarize by MachineId; +print + Init = toscalar(initUsers | count), + Provision = toscalar(provisionUsers | count), + Deploy = toscalar(deployUsers | count) +``` + +## Kusto Functions Reference + +These reusable functions are deployed to `DDAzureClients.DevCli` and simplify common query patterns. +See [Dashboards & Reports](telemetry-dashboards.md) for full details. + +| Function | Purpose | +|----------|---------| +| `getAzdEvents(...)` | Base query: filters `RawEventsAppRequests` by date, local clients, daily builds, min version | +| `getAzdArmEvents(...)` | ARM-specific event query | +| `addTemplateColumns` | Joins `Templates` table to resolve template hashes to names | +| `addCustomerColumns` | Enriches with customer/org details | +| `addAzSubColumns` | Adds Azure subscription metadata | +| `addExecutionTimeColumns` | Adds `ExecutionTimeMs` (duration minus interaction time) | +| `addAzdAndArmErrorDetails` | Enriches error rows with ARM error details | +| `flagTestAzSubs` | Flags known test/internal subscriptions | +| `calcAzdOperations(...)` | Calculates operation-level metrics | +| `calcFirstSuccessfulExecution(...)` | Finds first successful execution per user | +| `calcNeverBeforeSeenUsersForAzd(...)` | Identifies new users | + +## See Also + +- [Architecture](../architecture/telemetry.md) — End-to-end telemetry flow +- [Feature Telemetry Guide](../guides/feature-telemetry.md) — How to add telemetry for new features +- [Dashboards & Reports](telemetry-dashboards.md) — Power BI reports and Kusto functions +- [Telemetry Schema (canonical)](../../specs/metrics-audit/telemetry-schema.md) — Source-of-truth schema in the codebase +- [Privacy Review Checklist](../../specs/metrics-audit/privacy-review-checklist.md) — When and how to do privacy reviews