From 8475ca62b6a7f1c425461cb242b4926dfe86beb6 Mon Sep 17 00:00:00 2001 From: Robert M1 <50460704+githubrobbi@users.noreply.github.com> Date: Tue, 19 May 2026 14:14:49 -0700 Subject: [PATCH] chore(scripts): add build_codegen_audit.sh -- Phase 9a baseline tool (refs #298) Phase 9a baseline inventory tool for the workspace's build scripts, proc-macro crates, declarative macros, codegen binaries, and env-var consumption surface. Mirrors the pattern established by: - `scripts/dev/clone_alloc_audit.sh` (Phase 6a) - `scripts/dev/trait_generic_audit.sh` (Phase 7a) - `scripts/dev/feature_dep_audit.sh` (Phase 8a) Output (per recon at SHA `8a672bba7`): - 1 `build.rs` (`crates/uffs-cli/build.rs`, 108 LOC, MSVC-gated) - 0 proc-macro crates - 6 `macro_rules!` declarations (all `uffs-mft`, `pub(crate)`) - 4 workspace-internal codegen binaries (3 drift-detected via gates.toml, 1 release orchestrator) - 36 distinct env-var names consumed across workspace - 2 `include_*!` use sites (case-fold table in `uffs-text`) Modes: - default: rg-only, ~1 s (prod-globs filter; tests/benches/examples excluded) - `--with-cargo`: + cargo build --workspace --timings (~30 s warm) Companion plan: `docs/dev/architecture/code_clean/phase_9_build_scripts_macros_codegen_implementation_plan.md` (local-only, 276 LOC). Tracking issue: #298. Rule-1 adherence: zero `#[allow]` introductions. The script is a read-only inventory tool; emits markdown to stdout. Excludes test files via prod-globs (matching the Phase 6/7/8 audit-script pattern). Caveats documented inline: - env-var detection requires names ≥ 2 chars + filters comment-prefix lines (drops `X` false-positive from rustdoc prose). - macro_rules! parser captures name but does not classify per playbook §1064 (syntax shaping / impl repetition / pattern capture); classification happens in Phase 9d. 
--- scripts/dev/build_codegen_audit.sh | 572 +++++++++++++++++++++++++++++ 1 file changed, 572 insertions(+) create mode 100755 scripts/dev/build_codegen_audit.sh diff --git a/scripts/dev/build_codegen_audit.sh b/scripts/dev/build_codegen_audit.sh new file mode 100755 index 000000000..37b8e9f1e --- /dev/null +++ b/scripts/dev/build_codegen_audit.sh @@ -0,0 +1,572 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2025-2026 SKY, LLC. +# +# Phase 9 — Build scripts, macros, and code generation inventory for the +# UFFS workspace. +# +# Companion to: +# - docs/dev/architecture/code_clean/phase_9_build_scripts_macros_codegen_implementation_plan.md +# - scripts/dev/feature_dep_audit.sh (Phase 8a — same shape, different +# pattern set) +# - scripts/dev/trait_generic_audit.sh (Phase 7a — same shape) +# - scripts/dev/clone_alloc_audit.sh (Phase 6a — same shape) +# +# Purpose +# ------- +# Walk every workspace member and emit, **per crate**, the inventory the +# playbook §1013-1078 calls out: +# +# * `build.rs` presence + LOC + target-gate shape + `cargo:` directives +# emitted + env vars read at build-time. +# * `proc-macro = true` declarations (currently 0 workspace-wide; the +# audit script confirms the deliberate non-introduction). +# * `macro_rules!` declarations — name, file, line, scope (`pub` / +# `pub(crate)` / function-local), justification class per playbook +# §1064 (syntax shaping / trait impl repetition / pattern capture). +# * Codegen binaries — workspace-internal generator/validator binaries +# under `scripts/ci/` (`gen-hooks`, `gen-workflow`, `manifest-audit`, +# `ci-pipeline`) + their drift-detector wiring per `gates.toml`. +# * Env-var consumption — every `env::var(…)` / `env!(…)` / +# `option_env!(…)` use site, per env-var-name aggregation. +# * `include_bytes!` / `include_str!` / `include!` use sites — small +# leaf-data embedding (case-fold tables, embedded resources) vs +# codegen-pipeline magic. 
+# +# Workspace-level inventory: +# * Total `build.rs` count; per-crate gate (target_os / target_env / +# etc) — proves each one is necessary per playbook §1041-1046. +# * Total `proc-macro = true` count (expected: 0). +# * Total `macro_rules!` count, grouped by crate. +# * Codegen binary inventory cross-referenced against +# `scripts/ci/gates.toml` drift detectors. +# * Env-var registry — every distinct name, grouped by scope. +# +# Excludes (because the workspace's `clippy.toml` already relaxes lint +# posture inside these, and build/macro/codegen work is prod-only): +# +# * `tests/`, `benches/`, `examples/` directories under any crate. +# * Files named `tests.rs`, `*_tests.rs`, `*_test.rs`, `test_*.rs`. +# +# `build.rs` IS audited (it's the central artifact of this phase). +# +# Caveats (documented in the output preamble) +# ------------------------------------------- +# 1. The `macro_rules!` parser uses ripgrep + a small line-by-line scan; +# it captures the macro name from `macro_rules!\s+NAME` but does not +# attempt to parse the macro body or measure its complexity. Phase +# 9d's per-macro audit re-classifies each. +# +# 2. Env-var detection uses three regex shapes: `env::var\("…"\)`, +# `env!\("…"\)`, `option_env!\("…"\)`. Build-time envs read via the +# `CARGO_CFG_*` family are captured but not deduped against `env!` +# macros that read the same name. +# +# 3. The `proc-macro = true` check is a grep over each crate's +# `Cargo.toml` `[lib]` table — true if the line is present anywhere +# in the manifest (which is sufficient for a 0-result audit). +# +# Optional cargo cross-checks +# --------------------------- +# Pass `--with-cargo` to also run, in order: +# * `cargo build --workspace --timings` (~30 s warm) +# * `cargo expand` summary on the 6 `macro_rules!` sites (~5 s each) +# +# The default mode (no flag) is rg+awk only and runs in ~1 s. 
#
# Usage
# -----
#   scripts/dev/build_codegen_audit.sh                # fast (~1 s)
#   scripts/dev/build_codegen_audit.sh --with-cargo   # + cargo build --timings
#
# Output goes to stdout in Markdown. Redirect to capture:
#
#   scripts/dev/build_codegen_audit.sh \
#     > docs/dev/baseline/2026-05-19/phase_9_build_baseline.md
#
# Exit codes
# ----------
#   0 — script ran to completion. The *counts* of build.rs / macros /
#       env vars are information, not a failure signal.
#   1 — fatal scripting error (rg missing, repo root not detectable,
#       cargo invocation failed when `--with-cargo` was requested).

# NOTE(review): `-e` is not enabled; presumably because the rg/grep helper
# pipelines below exit non-zero on "no matches", which this script treats
# as data rather than failure — confirm before tightening.
set -uo pipefail

# ── Argument parsing ──────────────────────────────────────────────────
# WITH_CARGO=1 enables the optional (slow) cargo cross-checks.
WITH_CARGO=0
for arg in "$@"; do
  case "$arg" in
    --with-cargo)
      WITH_CARGO=1
      ;;
    --help | -h)
      # Print the leading comment header (shebang through the last
      # consecutive `#` line) instead of a fixed line count: the header is
      # longer than the previously hard-coded 90 lines, which cut off the
      # description of exit code 1.
      awk '/^#/ { print; next } { exit }' "$0"
      exit 0
      ;;
    *)
      echo "ERROR: unknown argument '$arg' (expected --with-cargo | --help)" >&2
      exit 1
      ;;
  esac
done

# ── Locate workspace root ─────────────────────────────────────────────
# The git top-level must contain `crates/`; every later path is relative
# to it, so the script changes directory there once.
ROOT="$(git rev-parse --show-toplevel 2>/dev/null || true)"
if [[ -z "$ROOT" || ! -d "$ROOT/crates" ]]; then
  echo "ERROR: not inside the UFFS workspace (expected 'crates/' at git root)" >&2
  exit 1
fi
cd "$ROOT" || {
  echo "ERROR: cd to '$ROOT' failed" >&2
  exit 1
}

# ── Required tooling ──────────────────────────────────────────────────
# ripgrep is always required; cargo only when --with-cargo was requested.
if ! command -v rg >/dev/null 2>&1; then
  echo "ERROR: 'rg' (ripgrep) not found in PATH" >&2
  exit 1
fi
if [[ "$WITH_CARGO" -eq 1 ]] && ! command -v cargo >/dev/null 2>&1; then
  echo "ERROR: 'cargo' not found in PATH (required for --with-cargo)" >&2
  exit 1
fi

# ── Crate inventory ───────────────────────────────────────────────────
# CRATES holds the sorted directory names of every `crates/<name>/Cargo.toml`.
mapfile -t CRATES < <(
  find crates -mindepth 2 -maxdepth 2 -name Cargo.toml \
    | sed -E 's|^crates/([^/]+)/Cargo.toml$|\1|' \
    | sort
)
if [[ ${#CRATES[@]} -eq 0 ]]; then
  echo "ERROR: no crates discovered under crates/" >&2
  exit 1
fi

# Workspace-internal codegen binaries (under scripts/ci/, not crates/).
mapfile -t CODEGEN_BINS < <(
  find scripts/ci -mindepth 2 -maxdepth 2 -name Cargo.toml \
    | sed -E 's|^scripts/ci/([^/]+)/Cargo.toml$|\1|' \
    | sort
)
# Also include scripts/ci-pipeline (the release orchestrator).
if [[ -f "scripts/ci-pipeline/Cargo.toml" ]]; then
  CODEGEN_BINS+=("ci-pipeline")
fi

# ── rg filter (prod-only — but INCLUDING build.rs unlike phase-6/7/8) ─
# Shared ripgrep glob arguments: Rust sources only, minus test / bench /
# example trees and files that follow the test naming conventions.
RG_PROD_GLOBS=(
  -g '*.rs'
  -g '!tests/**'
  -g '!benches/**'
  -g '!examples/**'
  -g '!**/tests.rs'
  -g '!**/*_tests.rs'
  -g '!**/*_test.rs'
  -g '!**/test_*.rs'
)

# Count pattern occurrences across a directory.
#
# Arguments:
#   $1 - directory (or file) to search
#   $2 - pattern (regex by default)
#   $3 - optional; 1 treats the pattern as a fixed string (default 0)
# Outputs:
#   Total number of matches on stdout; 0 when nothing matches or rg fails.
count_pattern() {
  local dir="$1"
  local pattern="$2"
  local fixed="${3:-0}"
  local rg_flags=("${RG_PROD_GLOBS[@]}" --no-heading --no-filename --count-matches)
  if [[ "$fixed" -eq 1 ]]; then
    rg_flags+=(-F)
  fi
  # `-e` + `--` keep a pattern or path that starts with `-` from being
  # parsed as an rg option. rg emits one per-file count per line (and exits
  # 1 when there are no matches), so awk folds the lines into a single total.
  rg "${rg_flags[@]}" -e "$pattern" -- "$dir" 2>/dev/null \
    | awk 'BEGIN{s=0} {s+=$1} END{print s+0}'
}

# Check whether a Cargo.toml declares `proc-macro = true`. Returns
# "yes" or "no".
#
# Arguments:
#   $1 - path to the manifest; an unreadable/missing file yields "no"
#
# The key may be indented and may use the `proc_macro` spelling; commented
# lines (`# proc-macro = true`) and `proc-macro = false` do not count.
is_proc_macro_crate() {
  local toml="$1"
  local key_re='^[[:space:]]*proc[-_]macro[[:space:]]*=[[:space:]]*true([[:space:]#]|$)'
  if grep -Eq -- "$key_re" "$toml" 2>/dev/null; then
    echo "yes"
  else
    echo "no"
  fi
}

# Extract every `macro_rules! NAME` declaration in a directory.
# Output: one "file:line:name" record per declaration.
#
# Arguments:
#   $1 - directory to search
list_macro_rules() {
  local dir="$1"
  # rg -n -o prints "path:line:macro_rules! NAME"; sed then drops the
  # keyword so only the macro name remains in the third field.
  # `[[:space:]]` instead of `\s` keeps the sed expression POSIX-portable.
  rg "${RG_PROD_GLOBS[@]}" \
    --no-heading -n -o \
    '\bmacro_rules!\s+[A-Za-z_][A-Za-z0-9_]*' \
    "$dir" 2>/dev/null \
    | sed -E 's|macro_rules![[:space:]]+||'
}

# Extract env-var names from a directory, filtering out matches that
# appear inside comments (lines whose first non-whitespace characters
# are `//`, `///`, or `*`). Requires env-var names to be 2+ chars to
# avoid matching single-letter rustdoc placeholders like `X` in prose.
+_extract_env_var_names() { + local dir="$1" + { + rg "${RG_PROD_GLOBS[@]}" -N \ + 'env::var\("[A-Z_][A-Z0-9_]+"\)' "$dir" 2>/dev/null + rg "${RG_PROD_GLOBS[@]}" -N \ + 'env!\("[A-Z_][A-Z0-9_]+"\)' "$dir" 2>/dev/null + rg "${RG_PROD_GLOBS[@]}" -N \ + 'option_env!\("[A-Z_][A-Z0-9_]+"\)' "$dir" 2>/dev/null + } | grep -Ev '^[[:space:]]*(//|/\*|\*[[:space:]])' \ + | sed -nE 's|.*"([A-Z_][A-Z0-9_]+)".*|\1|p' +} + +# Count distinct env-var names read in a directory. +count_env_vars() { + _extract_env_var_names "$1" | sort -u | grep -c . +} + +# List distinct env-var names workspace-wide, with each name reported +# once. +list_env_vars_workspace() { + { + _extract_env_var_names crates + _extract_env_var_names scripts + } | sort -u +} + +# Count `include_bytes!` / `include_str!` / `include!` use sites. +count_includes() { + local dir="$1" + rg "${RG_PROD_GLOBS[@]}" --no-heading --no-filename --count-matches \ + '\b(include_bytes|include_str|include)!' "$dir" 2>/dev/null \ + | awk 'BEGIN{s=0} {s+=$1} END{print s+0}' +} + +# Extract a `build.rs` summary: LOC, target gate shape, `cargo:` +# directive count, env vars read. +build_rs_summary() { + local path="$1" + if [[ ! 
-f "$path" ]]; then + echo "absent" + return + fi + local loc cargo_dirs envs gates + loc=$(wc -l <"$path" | tr -d ' ') + cargo_dirs=$(grep -c '^[[:space:]]*println!("cargo:' "$path" 2>/dev/null || echo 0) + envs=$(rg -o 'env::var\("[A-Z_][A-Z0-9_]*"\)|env!\("[A-Z_][A-Z0-9_]*"\)' "$path" 2>/dev/null | wc -l | tr -d ' ') + if grep -qE 'target_(os|env|family|arch)' "$path"; then + gates="cfg-gated" + else + gates="unconditional" + fi + echo "${loc} LOC, ${cargo_dirs} cargo: directives, ${envs} env-var reads, ${gates}" +} + +# ── Markdown preamble ───────────────────────────────────────────────── +SHA="$(git rev-parse HEAD)" +DATE_UTC="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + +cat </dev/null | sort -u | paste -sd ', ' - || echo "(none — unconditional)") +- **\`cargo:\` directives emitted:** +$(grep -E '^[[:space:]]*println!\("cargo:' "$path" 2>/dev/null \ + | sed -E 's|.*println!\("(cargo:[^"]*)".*| - `\1`|' \ + | sort -u || echo " - (none)") +- **Env vars read at build time:** +$({ rg -o 'env::var\("[A-Z_][A-Z0-9_]*"\)' "$path" 2>/dev/null \ + | sed -E 's|.*"([^"]+)".*| - `\1` (env::var)|' + rg -o 'env!\("[A-Z_][A-Z0-9_]*"\)' "$path" 2>/dev/null \ + | sed -E 's|.*"([^"]+)".*| - `\1` (env!)|'; } | sort -u || echo " - (none)") +- **\`#[allow]\` / \`#[expect]\` annotations:** +$(rg -n '#!\[(allow|expect)\(' "$path" 2>/dev/null \ + | sed -E 's|^([0-9]+):| - line \1: |' || echo " - (none)") + +EOF + fi + done +fi + +cat </phase_9_macro_audit_findings.md\` by Phase 9d. 
+ +--- + +## §5 — Codegen binaries (workspace-internal) + +EOF + +if [[ ${#CODEGEN_BINS[@]} -eq 0 ]]; then + echo "_No codegen binaries found under \`scripts/ci/\` or \`scripts/ci-pipeline/\`._" +else + cat </dev/null | head -1) + count=$(rg "${RG_PROD_GLOBS[@]}" -l "\"$ev\"" crates scripts 2>/dev/null | wc -l | tr -d ' ') + printf "| \`%s\` | %d | \`%s\` |\n" "$ev" "$count" "${sample:-N/A}" + done +fi + +cat </dev/null \ + | sed -E 's|^([^:]+):([0-9]+):(.*)$|- `\1:\2` — `\3`|' \ + | head -40 +fi + +cat < \`cargo build --workspace --timings\` — only available when invoked +> with \`--with-cargo\`. Captures the timing HTML report under +> \`target/cargo-timings/cargo-timing-\$DATE-\$SHA.html\`. + +EOF + echo '```' + cargo build --workspace --timings 2>&1 | tail -10 + echo '```' + echo + echo "HTML report saved to \`target/cargo-timings/cargo-timing-*.html\`." +fi + +cat <