microsoft · davidhonig · May 12, 2026 · spboyer · May 22, 2026
@@ -833,7 +833,7 @@ cmd/waza/              CLI entrypoint and command definitions
   tokens/              Token counting subcommand
 internal/
   config/              Configuration with functional options
-  execution/           AgentEngine interface (mock, copilot)
+  execution/           AgentEngine interface (mock, copilot, codex)
   graders/             Validator registry and built-in graders
   metrics/             Scoring metrics
   models/              Data structures (EvalSpec, TestCase, EvaluationOutcome)
@@ -857,8 +857,9 @@ config:
   max_attempts: 3          # Retry failed graders up to 3 times (default: 1, no retries)
   timeout_seconds: 300
   parallel: false
-  executor: mock          # or copilot-sdk
+  executor: mock          # or copilot-sdk, codex
   model: claude-sonnet-4-20250514
+  model_reasoning_effort: high  # codex only; none, minimal, low, medium, high, xhigh
   group_by: model          # Group results by model (or other dimension)
 
 # Custom input variables available as {{.Vars.key}} in tasks and hooks
@@ -1149,6 +1150,8 @@ jobs:
 | **Go Version** | 1.26 or higher |
 | **Executor** | Use `mock` executor for CI (no API keys needed) |
 | **GitHub Token** | Only required for `copilot-sdk` executor: set `GITHUB_TOKEN` env var |
+| **Codex Auth** | Only required for `codex` executor: uses the local Codex CLI config/auth from `~/.codex` |
+| **Codex Reasoning** | Optional `model_reasoning_effort` uses Codex's config key; common values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh` |
 | **Exit Codes** | 0=success, 1=test failure, 2=config error |
 
 #### Expected Skill Structure

@@ -322,6 +322,7 @@
 						Description("Choose how evals are executed").
 						Options(
 							huh.NewOption("Copilot SDK — real model execution", "copilot-sdk"),
+							huh.NewOption("Codex — use ~/.codex config/auth", "codex"),
 							huh.NewOption("Mock — fast iteration, no API calls", "mock"),
 						).
 						Value(&engine),
@@ -333,7 +334,7 @@
 			}

 			// Model selector (hidden when engine ≠ copilot-sdk)
 			if engine == "copilot-sdk" {
 				modelForm := huh.NewForm(
 					huh.NewGroup(
 						huh.NewSelect[string]().
@@ -366,6 +367,10 @@
 				if err := modelForm.Run(); err != nil {
 					model = projectconfig.DefaultModel
 				}
+			} else if engine == "codex" {
+				// Let Codex read the default model from ~/.codex/config.toml unless
+				// the eval later sets config.model or the user passes --model.
+				model = ""
 			}
 
 			pathsForm := huh.NewForm(

@@ -451,6 +451,9 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
 	if err != nil {
 		return nil, fmt.Errorf("failed to load spec: %w", err)
 	}
+	if cfg, cfgErr := projectconfig.Load(filepath.Dir(specPath)); cfgErr == nil {
+		applyProjectDefaultsToEvalSpec(spec, cfg)
+	}
 
 	// CLI flags override spec config
 	if parallel {
@@ -572,9 +575,51 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
 	return allResults, nil
 }
 
+func applyProjectDefaultsToEvalSpec(spec *models.EvalSpec, cfg *projectconfig.ProjectConfig) { // Overlays cfg.Defaults (engine, model, reasoning effort) onto spec.Config in place.
+	if spec == nil || cfg == nil { // nothing to apply without both a spec and a project config
+		return
+	}
+
+	defaultEngine := cfg.Defaults.Engine
+	if defaultEngine == "" { // project config names no engine: fall back to the built-in default
+		defaultEngine = projectconfig.DefaultEngine
+	}
+
+	engineWasDefault := spec.Config.EngineType == "" || // the spec left the engine unset, or
+		(spec.Config.EngineType == projectconfig.DefaultEngine && defaultEngine != projectconfig.DefaultEngine) // it holds the built-in default while the project default differs
+	if engineWasDefault { // NOTE(review): a spec that explicitly names the built-in default engine is treated like one that omitted it
+		spec.Config.EngineType = defaultEngine
+	}
+
+	defaultModel := cfg.Defaults.Model // may be empty; unlike the engine, no built-in fallback is substituted here
+	modelWasDefault := spec.Config.ModelID == "" || // the spec left the model unset, or
+		(spec.Config.ModelID == projectconfig.DefaultModel && // it holds the built-in default model and
+			(defaultModel != projectconfig.DefaultModel || engineWasDefault)) // the project default differs or the engine was treated as defaulted above
+	if modelWasDefault { // NOTE(review): this can assign an empty model ID when the project config sets none — confirm that is intended for every engine
+		spec.Config.ModelID = defaultModel
+	}
+	if spec.Config.ModelReasoningEffort == "" { // a reasoning effort set in the spec wins over the project default
+		spec.Config.ModelReasoningEffort = cfg.Defaults.ModelReasoningEffort
+	}
+}
+
+func displayModel(cfg models.Config) string { // Returns the model label to print for a run.
+	if cfg.ModelID != "" { // an explicit model ID is shown as-is
+		return cfg.ModelID
+	}
+	if cfg.EngineType == "codex" { // empty model with the codex engine: show a placeholder instead of a blank
+		return "default (Codex config)"
+	}
+	return "" // any other engine with no model ID yields an empty label
+}
+
 // runSingleModel executes a benchmark for one model and returns the outcome.
 // It prints the per-model summary and saves output for single-model runs.
 func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, defaultSkills []string) (*models.EvaluationOutcome, error) {
+	if err := validateEngineFeatureSupport(spec); err != nil {
+		return nil, err
+	}
+
 	// Get spec directory for resolving relative paths
 	specDir := filepath.Dir(specPath)
 	if !filepath.IsAbs(specDir) {
@@ -641,6 +686,8 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 		engine = execution.NewCopilotEngineBuilder(spec.Config.ModelID, &execution.CopilotEngineBuilderOptions{
 			NewCopilotClient: newCopilotClientFn, // if nil, uses the real function, otherwise overridable for tests.
 		}).Build()
+	case "codex":
+		engine = execution.NewCodexEngine(spec.Config.ModelID)
 	default:
 		return nil, fmt.Errorf("unknown engine type: %s", spec.Config.EngineType)
 	}
@@ -735,7 +782,7 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 	fmt.Printf("Running benchmark: %s\n", spec.Name)
 	fmt.Printf("Skill: %s\n", spec.SkillName)
 	fmt.Printf("Engine: %s\n", spec.Config.EngineType)
-	fmt.Printf("Model: %s\n", spec.Config.ModelID)
+	fmt.Printf("Model: %s\n", displayModel(spec.Config))
 	if spec.Config.JudgeModel != "" {
 		fmt.Printf("Judge Model: %s\n", spec.Config.JudgeModel)
 	}
@@ -906,6 +953,18 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 	return outcome, nil
 }
 
+func validateEngineFeatureSupport(spec *models.EvalSpec) error { // Reports an error when the spec uses a grader kind the selected engine cannot serve; only codex is checked.
+	if spec == nil || spec.Config.EngineType != "codex" { // nil specs and non-codex engines pass without inspection
+		return nil
+	}
+	for _, grader := range spec.Graders {
+		if grader.Kind == models.GraderKindSkillInvocation { // stop at the first skill_invocation grader and name it in the error
+			return fmt.Errorf("grader %q uses skill_invocation, which is not supported by the codex executor because Codex CLI does not emit skill invocation telemetry", grader.Identifier)
+		}
+	}
+	return nil // no unsupported grader kinds were found
+}
+
 // printModelComparison renders a comparison table for multi-model runs.
 func printModelComparison(results []modelResult) {
 	slices.SortFunc(results, func(a, b modelResult) int {