diff --git a/README.md b/README.md
index 9ef3973e7..05b0ce765 100644
--- a/README.md
+++ b/README.md
@@ -833,7 +833,7 @@ cmd/waza/ CLI entrypoint and command definitions
tokens/ Token counting subcommand
internal/
config/ Configuration with functional options
- execution/ AgentEngine interface (mock, copilot)
+ execution/ AgentEngine interface (mock, copilot, codex)
graders/ Validator registry and built-in graders
metrics/ Scoring metrics
models/ Data structures (EvalSpec, TestCase, EvaluationOutcome)
@@ -857,8 +857,9 @@ config:
max_attempts: 3 # Retry failed graders up to 3 times (default: 1, no retries)
timeout_seconds: 300
parallel: false
- executor: mock # or copilot-sdk
+ executor: mock # or copilot-sdk, codex
model: claude-sonnet-4-20250514
+ model_reasoning_effort: high # codex only; none, minimal, low, medium, high, xhigh
group_by: model # Group results by model (or other dimension)
# Custom input variables available as {{.Vars.key}} in tasks and hooks
@@ -1149,6 +1150,8 @@ jobs:
| **Go Version** | 1.26 or higher |
| **Executor** | Use `mock` executor for CI (no API keys needed) |
| **GitHub Token** | Only required for `copilot-sdk` executor: set `GITHUB_TOKEN` env var |
+| **Codex Auth** | Only required for `codex` executor: uses the local Codex CLI config/auth from `~/.codex` |
+| **Codex Reasoning** | Optional `model_reasoning_effort` uses Codex's config key; common values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh` |
| **Exit Codes** | 0=success, 1=test failure, 2=config error |
#### Expected Skill Structure
diff --git a/cmd/waza/cmd_init.go b/cmd/waza/cmd_init.go
index 31ec04fda..ad236aa39 100644
--- a/cmd/waza/cmd_init.go
+++ b/cmd/waza/cmd_init.go
@@ -322,6 +322,7 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
Description("Choose how evals are executed").
Options(
huh.NewOption("Copilot SDK — real model execution", "copilot-sdk"),
+ huh.NewOption("Codex — use ~/.codex config/auth", "codex"),
huh.NewOption("Mock — fast iteration, no API calls", "mock"),
).
Value(&engine),
@@ -366,6 +367,10 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
if err := modelForm.Run(); err != nil {
model = projectconfig.DefaultModel
}
+ } else if engine == "codex" {
+ // Let Codex read the default model from ~/.codex/config.toml unless
+ // the eval later sets config.model or the user passes --model.
+ model = ""
}
pathsForm := huh.NewForm(
diff --git a/cmd/waza/cmd_run.go b/cmd/waza/cmd_run.go
index c3ddd2a29..8eb8d28ed 100644
--- a/cmd/waza/cmd_run.go
+++ b/cmd/waza/cmd_run.go
@@ -451,6 +451,9 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
if err != nil {
return nil, fmt.Errorf("failed to load spec: %w", err)
}
+ if cfg, cfgErr := projectconfig.Load(filepath.Dir(specPath)); cfgErr == nil {
+ applyProjectDefaultsToEvalSpec(spec, cfg)
+ }
// CLI flags override spec config
if parallel {
@@ -572,9 +575,51 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
return allResults, nil
}
+// applyProjectDefaultsToEvalSpec copies engine, model, and reasoning-effort
+// defaults from the project configuration onto spec.
+//
+// A spec value is replaced when it is empty, or when it still equals the
+// built-in default while the project configuration points somewhere else.
+// Passing a nil spec or a nil cfg is a no-op.
+func applyProjectDefaultsToEvalSpec(spec *models.EvalSpec, cfg *projectconfig.ProjectConfig) {
+	if spec == nil || cfg == nil {
+		return
+	}
+	target := &spec.Config
+
+	// A project config without an engine behaves like the built-in default.
+	projectEngine := cfg.Defaults.Engine
+	if projectEngine == "" {
+		projectEngine = projectconfig.DefaultEngine
+	}
+
+	replaceEngine := target.EngineType == ""
+	if !replaceEngine && target.EngineType == projectconfig.DefaultEngine {
+		replaceEngine = projectEngine != projectconfig.DefaultEngine
+	}
+	if replaceEngine {
+		target.EngineType = projectEngine
+	}
+
+	// The model is replaced when it is empty, or when it is still the built-in
+	// default and either the project names a different model or the engine was
+	// just replaced above. The project model may itself be empty.
+	projectModel := cfg.Defaults.Model
+	replaceModel := target.ModelID == ""
+	if !replaceModel && target.ModelID == projectconfig.DefaultModel {
+		replaceModel = replaceEngine || projectModel != projectconfig.DefaultModel
+	}
+	if replaceModel {
+		target.ModelID = projectModel
+	}
+
+	if target.ModelReasoningEffort == "" {
+		target.ModelReasoningEffort = cfg.Defaults.ModelReasoningEffort
+	}
+}
+
+// displayModel returns the text shown on the "Model:" line of the run banner:
+// the configured model ID when there is one, a placeholder for the codex
+// engine when no model is configured, and an empty string in every other case.
+func displayModel(cfg models.Config) string {
+	switch {
+	case cfg.ModelID != "":
+		return cfg.ModelID
+	case cfg.EngineType == "codex":
+		return "default (Codex config)"
+	default:
+		return ""
+	}
+}
+
// runSingleModel executes a benchmark for one model and returns the outcome.
// It prints the per-model summary and saves output for single-model runs.
func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, defaultSkills []string) (*models.EvaluationOutcome, error) {
+ if err := validateEngineFeatureSupport(spec); err != nil {
+ return nil, err
+ }
+
// Get spec directory for resolving relative paths
specDir := filepath.Dir(specPath)
if !filepath.IsAbs(specDir) {
@@ -641,6 +686,8 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
engine = execution.NewCopilotEngineBuilder(spec.Config.ModelID, &execution.CopilotEngineBuilderOptions{
NewCopilotClient: newCopilotClientFn, // if nil, uses the real function, otherwise overridable for tests.
}).Build()
+ case "codex":
+ engine = execution.NewCodexEngine(spec.Config.ModelID)
default:
return nil, fmt.Errorf("unknown engine type: %s", spec.Config.EngineType)
}
@@ -735,7 +782,7 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
fmt.Printf("Running benchmark: %s\n", spec.Name)
fmt.Printf("Skill: %s\n", spec.SkillName)
fmt.Printf("Engine: %s\n", spec.Config.EngineType)
- fmt.Printf("Model: %s\n", spec.Config.ModelID)
+ fmt.Printf("Model: %s\n", displayModel(spec.Config))
if spec.Config.JudgeModel != "" {
fmt.Printf("Judge Model: %s\n", spec.Config.JudgeModel)
}
@@ -906,6 +953,18 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
return outcome, nil
}
+// validateEngineFeatureSupport returns an error when spec selects the codex
+// executor and also declares a skill_invocation grader. A nil spec and every
+// other engine type pass without inspection.
+func validateEngineFeatureSupport(spec *models.EvalSpec) error {
+	if spec == nil {
+		return nil
+	}
+	if spec.Config.EngineType != "codex" {
+		return nil
+	}
+
+	for i := range spec.Graders {
+		candidate := spec.Graders[i]
+		if candidate.Kind != models.GraderKindSkillInvocation {
+			continue
+		}
+		// The first offending grader is reported; later ones are not checked.
+		return fmt.Errorf("grader %q uses skill_invocation, which is not supported by the codex executor because Codex CLI does not emit skill invocation telemetry", candidate.Identifier)
+	}
+	return nil
+}
+
// printModelComparison renders a comparison table for multi-model runs.
func printModelComparison(results []modelResult) {
slices.SortFunc(results, func(a, b modelResult) int {
diff --git a/cmd/waza/cmd_run_test.go b/cmd/waza/cmd_run_test.go
index 11c7559b0..f0ee6d87a 100644
--- a/cmd/waza/cmd_run_test.go
+++ b/cmd/waza/cmd_run_test.go
@@ -11,6 +11,7 @@ import (
"maps"
"os"
"path/filepath"
+ "runtime"
"slices"
"sort"
"strings"
@@ -290,6 +291,309 @@ func TestRunCommand_MockEngineRun(t *testing.T) {
assert.NoError(t, err)
}
+// TestRunCommand_CodexEngineRun runs the run command end to end with the codex
+// executor, using a fake `codex` shell script placed first on PATH, and checks
+// the engine type, model ID, and final output recorded in the results file.
+func TestRunCommand_CodexEngineRun(t *testing.T) {
+ if runtime.GOOS == "windows" {
+ t.Skip("fake codex shell script is POSIX-only")
+ }
+ resetRunGlobals()
+ defer resetRunGlobals()
+
+ // The fake CLI honors --cd and --output-last-message, saves stdin as
+ // prompt.txt, and writes a fixed final message to the output file.
+ fakeDir := t.TempDir()
+ fakeCodex := filepath.Join(fakeDir, "codex")
+ fakeScript := `#!/bin/sh
+work=""
+out=""
+while [ "$#" -gt 0 ]; do
+ case "$1" in
+ --cd)
+ work="$2"
+ shift 2
+ ;;
+ --output-last-message)
+ out="$2"
+ shift 2
+ ;;
+ *)
+ shift
+ ;;
+ esac
+done
+if [ -n "$work" ]; then
+ cd "$work"
+fi
+cat > prompt.txt
+printf "codex command output" > generated.txt
+printf "codex final output" > "$out"
+`
+ require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755))
+ // Prepend the fake directory so the engine's PATH lookup finds it first.
+ t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+ // NOTE(review): createTestSpec is defined outside this view; the
+ // "test-model" assertion below presumably matches the model it writes.
+ specPath := createTestSpec(t, "codex")
+ outFile := filepath.Join(t.TempDir(), "results.json")
+
+ cmd := newRunCommand()
+ cmd.SetArgs([]string{specPath, "--output", outFile})
+ cmd.SetOut(io.Discard)
+ cmd.SetErr(io.Discard)
+
+ err := cmd.Execute()
+ require.NoError(t, err)
+
+ data, err := os.ReadFile(outFile)
+ require.NoError(t, err)
+ var result models.EvaluationOutcome
+ require.NoError(t, json.Unmarshal(data, &result))
+ assert.Equal(t, "codex", result.Setup.EngineType)
+ assert.Equal(t, "test-model", result.Setup.ModelID)
+ require.Len(t, result.TestOutcomes, 1)
+ require.Len(t, result.TestOutcomes[0].Runs, 1)
+ // The final output must come from the --output-last-message file.
+ assert.Equal(t, "codex final output", result.TestOutcomes[0].Runs[0].FinalOutput)
+}
+
+// TestRunCommand_WazaYamlCodexOverridesScaffoldedCopilotDefault checks that a
+// .waza.yaml with `defaults.engine: codex` and no model takes over an eval spec
+// that names `executor: copilot-sdk` and `model: claude-sonnet-4.6` (values the
+// test name treats as scaffolded defaults). The run must use codex, record an
+// empty model ID, and pass no --model flag to the CLI.
+func TestRunCommand_WazaYamlCodexOverridesScaffoldedCopilotDefault(t *testing.T) {
+ if runtime.GOOS == "windows" {
+ t.Skip("fake codex shell script is POSIX-only")
+ }
+ resetRunGlobals()
+ defer resetRunGlobals()
+
+ // The fake CLI records every argument it receives, both in the workspace
+ // (args.txt) and in the file named by WAZA_FAKE_CODEX_ARGS when that is set.
+ fakeDir := t.TempDir()
+ fakeCodex := filepath.Join(fakeDir, "codex")
+ fakeScript := `#!/bin/sh
+work=""
+out=""
+args=""
+while [ "$#" -gt 0 ]; do
+ case "$1" in
+ --cd)
+ work="$2"
+ args="$args $1 $2"
+ shift 2
+ ;;
+ --output-last-message)
+ out="$2"
+ args="$args $1 $2"
+ shift 2
+ ;;
+ *)
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+if [ -n "$work" ]; then
+ cd "$work"
+fi
+cat > prompt.txt
+printf "%s" "$args" > args.txt
+if [ -n "${WAZA_FAKE_CODEX_ARGS:-}" ]; then
+ printf "%s" "$args" > "$WAZA_FAKE_CODEX_ARGS"
+fi
+printf "codex final output" > "$out"
+`
+ require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755))
+ t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH"))
+ // The recorded arguments are written outside the workspace so the test can
+ // read them after the run.
+ argsPath := filepath.Join(t.TempDir(), "args.txt")
+ t.Setenv("WAZA_FAKE_CODEX_ARGS", argsPath)
+
+ // Project config: only the engine is set, no model.
+ dir := t.TempDir()
+ require.NoError(t, os.WriteFile(filepath.Join(dir, ".waza.yaml"), []byte("defaults:\n engine: codex\n"), 0o644))
+
+ taskDir := filepath.Join(dir, "tasks")
+ require.NoError(t, os.MkdirAll(taskDir, 0o755))
+ task := `id: default-engine-task
+name: Default Engine Task
+inputs:
+ prompt: "Hello"
+`
+ require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(task), 0o644))
+
+ spec := `name: default-engine-test
+skill: cfg-skill
+version: "1.0"
+config:
+ trials_per_task: 1
+ timeout_seconds: 10
+ executor: copilot-sdk
+ model: claude-sonnet-4.6
+tasks:
+ - "tasks/*.yaml"
+`
+ specPath := filepath.Join(dir, "eval.yaml")
+ require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644))
+
+ outFile := filepath.Join(t.TempDir(), "results.json")
+ cmd := newRunCommand()
+ cmd.SetArgs([]string{specPath, "--output", outFile})
+ cmd.SetOut(io.Discard)
+ cmd.SetErr(io.Discard)
+
+ err := cmd.Execute()
+ require.NoError(t, err)
+
+ data, err := os.ReadFile(outFile)
+ require.NoError(t, err)
+ var result models.EvaluationOutcome
+ require.NoError(t, json.Unmarshal(data, &result))
+ assert.Equal(t, "codex", result.Setup.EngineType)
+ assert.Equal(t, "", result.Setup.ModelID, "codex should use ~/.codex/config.toml when default model is omitted")
+ assert.Equal(t, "codex final output", result.TestOutcomes[0].Runs[0].FinalOutput)
+
+ // With no model configured, the CLI must not receive a --model flag.
+ argsData, err := os.ReadFile(argsPath)
+ require.NoError(t, err)
+ assert.NotContains(t, string(argsData), "--model")
+}
+
+// TestRunCommand_WazaYamlCodexConfigModelAndReasoningEffort checks that the
+// engine, model, and model_reasoning_effort defaults from .waza.yaml reach the
+// Codex command line when the eval spec names `executor: copilot-sdk` and
+// `model: claude-sonnet-4.6`.
+func TestRunCommand_WazaYamlCodexConfigModelAndReasoningEffort(t *testing.T) {
+ if runtime.GOOS == "windows" {
+ t.Skip("fake codex shell script is POSIX-only")
+ }
+ resetRunGlobals()
+ defer resetRunGlobals()
+
+ // The fake CLI records every argument it receives, both in the workspace
+ // (args.txt) and in the file named by WAZA_FAKE_CODEX_ARGS when that is set.
+ fakeDir := t.TempDir()
+ fakeCodex := filepath.Join(fakeDir, "codex")
+ fakeScript := `#!/bin/sh
+work=""
+out=""
+args=""
+while [ "$#" -gt 0 ]; do
+ case "$1" in
+ --cd)
+ work="$2"
+ args="$args $1 $2"
+ shift 2
+ ;;
+ --output-last-message)
+ out="$2"
+ args="$args $1 $2"
+ shift 2
+ ;;
+ *)
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+if [ -n "$work" ]; then
+ cd "$work"
+fi
+cat > prompt.txt
+printf "%s" "$args" > args.txt
+if [ -n "${WAZA_FAKE_CODEX_ARGS:-}" ]; then
+ printf "%s" "$args" > "$WAZA_FAKE_CODEX_ARGS"
+fi
+printf "codex final output" > "$out"
+`
+ require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755))
+ t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH"))
+ argsPath := filepath.Join(t.TempDir(), "args.txt")
+ t.Setenv("WAZA_FAKE_CODEX_ARGS", argsPath)
+
+ // Project config sets engine, model, and reasoning effort.
+ dir := t.TempDir()
+ require.NoError(t, os.WriteFile(filepath.Join(dir, ".waza.yaml"), []byte(`defaults:
+ engine: codex
+ model: gpt-4o
+ model_reasoning_effort: high
+`), 0o644))
+
+ taskDir := filepath.Join(dir, "tasks")
+ require.NoError(t, os.MkdirAll(taskDir, 0o755))
+ task := `id: config-model-task
+name: Config Model Task
+inputs:
+ prompt: "Hello"
+`
+ require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(task), 0o644))
+
+ spec := `name: config-model-test
+skill: cfg-skill
+version: "1.0"
+config:
+ trials_per_task: 1
+ timeout_seconds: 10
+ executor: copilot-sdk
+ model: claude-sonnet-4.6
+tasks:
+ - "tasks/*.yaml"
+`
+ specPath := filepath.Join(dir, "eval.yaml")
+ require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644))
+
+ outFile := filepath.Join(t.TempDir(), "results.json")
+ cmd := newRunCommand()
+ cmd.SetArgs([]string{specPath, "--output", outFile})
+ cmd.SetOut(io.Discard)
+ cmd.SetErr(io.Discard)
+
+ err := cmd.Execute()
+ require.NoError(t, err)
+
+ data, err := os.ReadFile(outFile)
+ require.NoError(t, err)
+ var result models.EvaluationOutcome
+ require.NoError(t, json.Unmarshal(data, &result))
+ assert.Equal(t, "codex", result.Setup.EngineType)
+ assert.Equal(t, "gpt-4o", result.Setup.ModelID)
+
+ // Both the model flag and the reasoning-effort override must be passed.
+ argsData, err := os.ReadFile(argsPath)
+ require.NoError(t, err)
+ args := string(argsData)
+ assert.Contains(t, args, "--model gpt-4o")
+ assert.Contains(t, args, `model_reasoning_effort="high"`)
+}
+
+// TestDisplayModelShowsCodexConfigDefault checks that displayModel reports the
+// Codex placeholder when no model is set and the explicit model ID otherwise.
+func TestDisplayModelShowsCodexConfigDefault(t *testing.T) {
+	unset := models.Config{EngineType: "codex"}
+	assert.Equal(t, "default (Codex config)", displayModel(unset))
+
+	explicit := models.Config{EngineType: "codex", ModelID: "gpt-4o"}
+	assert.Equal(t, "gpt-4o", displayModel(explicit))
+}
+
+// TestRunCommand_CodexRejectsSkillInvocationGrader checks that running an eval
+// spec that combines `executor: codex` with a skill_invocation grader fails
+// with an error naming the unsupported grader type. No fake codex binary is
+// installed and the test has no Windows skip.
+func TestRunCommand_CodexRejectsSkillInvocationGrader(t *testing.T) {
+ resetRunGlobals()
+ defer resetRunGlobals()
+
+ dir := t.TempDir()
+ taskDir := filepath.Join(dir, "tasks")
+ require.NoError(t, os.MkdirAll(taskDir, 0o755))
+ require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(`id: skill-telemetry-task
+name: Skill Telemetry Task
+inputs:
+ prompt: "Hello"
+`), 0o644))
+
+ // The spec pairs the codex executor with a skill_invocation grader.
+ spec := `name: skill-telemetry-test
+skill: cfg-skill
+version: "1.0"
+config:
+ trials_per_task: 1
+ timeout_seconds: 10
+ executor: codex
+graders:
+ - type: skill_invocation
+ name: required_skill
+ config:
+ required_skills:
+ - cfg-skill
+tasks:
+ - "tasks/*.yaml"
+`
+ specPath := filepath.Join(dir, "eval.yaml")
+ require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644))
+
+ cmd := newRunCommand()
+ cmd.SetArgs([]string{specPath})
+ cmd.SetOut(io.Discard)
+ cmd.SetErr(io.Discard)
+
+ err := cmd.Execute()
+ require.Error(t, err)
+ assert.Contains(t, err.Error(), "skill_invocation")
+ assert.Contains(t, err.Error(), "not supported by the codex executor")
+}
+
func TestRunCommand_MockEngineVerbose(t *testing.T) {
resetRunGlobals()
diff --git a/internal/execution/codex.go b/internal/execution/codex.go
new file mode 100644
index 000000000..4914d7f56
--- /dev/null
+++ b/internal/execution/codex.go
@@ -0,0 +1,513 @@
+package execution
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	copilot "github.com/github/copilot-sdk/go"
+	"github.com/microsoft/waza/internal/models"
+)
+
+// CodexEngine executes tasks through the local Codex CLI.
+//
+// The Codex CLI owns its own configuration and authentication discovery, so this
+// engine intentionally does not parse ~/.codex/config.toml or auth.json. It
+// invokes `codex exec` in Waza's isolated workspace and lets Codex load its
+// normal config/auth state.
+type CodexEngine struct {
+ // defaultModelID is the model used when a request does not name one. When
+ // both are empty no --model flag is passed to the CLI.
+ defaultModelID string
+ // binary is the executable name or path that Initialize resolves.
+ binary string
+ // binaryPath is the resolved executable path, set by Initialize.
+ binaryPath string
+
+ // workspacesMu guards workspaces.
+ workspacesMu sync.Mutex
+ // workspaces lists the temporary workspace directories created by Execute.
+ workspaces []string
+ // keepWorkspace makes Shutdown report workspaces instead of deleting them.
+ keepWorkspace bool
+
+ // initCalled records a successful Initialize; Execute refuses to run
+ // until it is set.
+ initCalled atomic.Bool
+}
+
+// CodexEngineOption configures a CodexEngine. Options are applied in order by
+// NewCodexEngine after the engine's defaults have been set.
+type CodexEngineOption func(*CodexEngine)
+
+// WithCodexBinary returns an option that replaces the Codex executable name or
+// path used by the engine. An empty path yields an option that changes nothing.
+// It is mainly useful for tests and for users who keep Codex outside PATH.
+func WithCodexBinary(path string) CodexEngineOption {
+	if path == "" {
+		return func(*CodexEngine) {}
+	}
+	return func(e *CodexEngine) { e.binary = path }
+}
+
+// NewCodexEngine builds a Codex-backed execution engine. The executable
+// defaults to "codex", and opts are applied in the order given. The engine must
+// still be initialized with Initialize before Execute is called.
+func NewCodexEngine(defaultModelID string, opts ...CodexEngineOption) *CodexEngine {
+	engine := new(CodexEngine)
+	engine.defaultModelID = defaultModelID
+	engine.binary = "codex"
+
+	for i := range opts {
+		opts[i](engine)
+	}
+	return engine
+}
+
+// SetKeepWorkspace enables or disables workspace preservation on shutdown.
+// When keep is true, Shutdown prints each workspace path to stderr instead of
+// removing it. The flag is written without taking workspacesMu.
+func (e *CodexEngine) SetKeepWorkspace(keep bool) {
+ e.keepWorkspace = keep
+}
+
+// Initialize resolves the Codex executable and marks the engine ready for
+// Execute. It returns the context error if ctx is already done, and an error if
+// the executable cannot be found. Codex itself loads config and auth when the
+// first task runs.
+func (e *CodexEngine) Initialize(ctx context.Context) error {
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	resolved, lookErr := exec.LookPath(e.binary)
+	if lookErr != nil {
+		return fmt.Errorf("codex executable %q not found in PATH: %w", e.binary, lookErr)
+	}
+
+	e.binaryPath = resolved
+	e.initCalled.Store(true)
+	return nil
+}
+
+// Execute runs one prompt through the Codex CLI and returns what it produced.
+//
+// A fresh run uses `codex exec`; when req.SessionID is set the run resumes that
+// session instead. The prompt is passed on stdin. When req.WorkspaceDir is
+// empty a temporary workspace is created, populated from req.Resources, and
+// tracked so Shutdown can remove it. req.Timeout bounds the run only when ctx
+// carries no deadline of its own.
+//
+// Request problems (nil request, engine not initialized, non-positive Timeout,
+// CancelOnSkillInvocation) and local setup failures are returned as errors. A
+// failing Codex process is not an error: it yields a response with
+// Success=false and ErrorMsg built from stderr and the process error.
+func (e *CodexEngine) Execute(ctx context.Context, req *ExecutionRequest) (*ExecutionResponse, error) {
+	if req == nil {
+		return nil, fmt.Errorf("nil req was passed to CodexEngine.Execute")
+	}
+	if !e.initCalled.Load() {
+		return nil, fmt.Errorf("engine was not initialized. Initialize needs to be called before Execute")
+	}
+	if req.Timeout <= 0 {
+		return nil, fmt.Errorf("positive Timeout is required")
+	}
+	// Reject unsupported requests before any workspace or temp file is created,
+	// so a rejected request leaves nothing behind to clean up.
+	if req.CancelOnSkillInvocation {
+		return nil, fmt.Errorf("codex engine does not support skill invocation telemetry required by trigger tests")
+	}
+
+	// A model named on the request wins over the engine default.
+	modelID := e.defaultModelID
+	if req.ModelID != "" {
+		modelID = req.ModelID
+	}
+
+	sourceDir := req.SourceDir
+	if sourceDir == "" {
+		cwd, err := os.Getwd()
+		if err != nil {
+			return nil, fmt.Errorf("failed to get current directory: %w", err)
+		}
+		sourceDir = cwd
+	}
+
+	start := time.Now()
+
+	// Follow-up prompts reuse the caller's workspace; otherwise create one.
+	workspaceDir := req.WorkspaceDir
+	if workspaceDir == "" {
+		tmpDir, err := os.MkdirTemp("", "waza-codex-*")
+		if err != nil {
+			return nil, fmt.Errorf("failed to create codex workspace: %w", err)
+		}
+		workspaceDir = tmpDir
+		// Track before populating so Shutdown removes it even if setup fails.
+		e.trackWorkspace(workspaceDir)
+
+		if err := setupWorkspaceResources(workspaceDir, req.Resources); err != nil {
+			return nil, fmt.Errorf("failed to setup codex workspace resources: %w", err)
+		}
+	}
+
+	if _, hasDeadline := ctx.Deadline(); !hasDeadline {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, req.Timeout)
+		defer cancel()
+	}
+
+	// Codex writes its last message to this file (--output-last-message).
+	outputFile, err := os.CreateTemp("", "waza-codex-output-*.txt")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create codex output file: %w", err)
+	}
+	outputPath := outputFile.Name()
+	// Only the path is needed here; a close error on the empty file is ignored.
+	_ = outputFile.Close()
+	defer os.Remove(outputPath) //nolint:errcheck
+
+	args := e.buildArgs(req, modelID, workspaceDir, outputPath)
+
+	prompt := e.buildPrompt(sourceDir, req)
+	cmd := exec.CommandContext(ctx, e.binaryPath, args...)
+	cmd.Dir = workspaceDir
+	cmd.Env = os.Environ()
+	cmd.Stdin = strings.NewReader(prompt)
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	runErr := cmd.Run()
+	// Telemetry and output are collected even when the process failed.
+	telemetry := parseCodexJSONEvents(stdout.String())
+	finalOutput := readCodexOutput(outputPath, telemetry.FinalOutput())
+
+	errMsg := ""
+	success := true
+	if runErr != nil {
+		success = false
+		errMsg = strings.TrimSpace(stderr.String())
+		if errMsg == "" {
+			errMsg = runErr.Error()
+		} else {
+			errMsg = fmt.Sprintf("%s: %v", errMsg, runErr)
+		}
+	}
+
+	// Prefer the session ID Codex reported, then the caller's, then a
+	// generated one.
+	sessionID := telemetry.SessionID
+	if sessionID == "" {
+		sessionID = req.SessionID
+	}
+	if sessionID == "" {
+		sessionID = fmt.Sprintf("codex-session-%d", time.Now().UnixNano())
+	}
+
+	return &ExecutionResponse{
+		FinalOutput:    finalOutput,
+		Events:         telemetry.Events,
+		ModelID:        modelID,
+		DurationMs:     time.Since(start).Milliseconds(),
+		ToolCalls:      models.FilterToolCalls(telemetry.Events),
+		ErrorMsg:       errMsg,
+		Success:        success,
+		WorkspaceDir:   workspaceDir,
+		WorkspaceFiles: captureWorkspaceFiles(workspaceDir),
+		SessionID:      sessionID,
+		Usage:          telemetry.Usage,
+	}, nil
+}
+
+func (e *CodexEngine) buildArgs(req *ExecutionRequest, modelID, workspaceDir, outputPath string) []string {
+ common := []string{
+ "-c", `approval_policy="never"`,
+ "--skip-git-repo-check",
+ "--output-last-message", outputPath,
+ }
+ if modelID != "" {
+ common = append(common, "--model", modelID)
+ }
+ if req.ModelReasoningEffort != "" {
+ common = append(common, "-c", fmt.Sprintf("model_reasoning_effort=%q", req.ModelReasoningEffort))
+ }
+
+ if req.SessionID != "" {
+ args := []string{
+ "exec",
+ "resume",
+ "--json",
+ "-c", `sandbox_mode="workspace-write"`,
+ }
+ args = append(args, common...)
+ args = append(args, req.SessionID, "-")
+ return args
+ }
+
+ args := []string{
+ "exec",
+ "--json",
+ "--cd", workspaceDir,
+ "--sandbox", "workspace-write",
+ "--color", "never",
+ }
+ args = append(args, common...)
+ args = append(args, "-")
+ return args
+}
+
+// Shutdown removes Codex workspaces created by this engine.
+func (e *CodexEngine) Shutdown(ctx context.Context) error {
+ workspaces := func() []string {
+ e.workspacesMu.Lock()
+ defer e.workspacesMu.Unlock()
+ ws := e.workspaces
+ e.workspaces = nil
+ return ws
+ }()
+
+ for _, ws := range workspaces {
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ default:
+ }
+ if ws == "" {
+ continue
+ }
+ if e.keepWorkspace {
+ fmt.Fprintf(os.Stderr, "Workspace preserved: %s\n", ws)
+ continue
+ }
+ if err := os.RemoveAll(ws); err != nil {
+ return fmt.Errorf("failed to remove codex workspace %s: %w", ws, err)
+ }
+ }
+ return nil
+}
+
+// SessionUsage returns nil because codex exec does not currently expose Waza's
+// Copilot-style session usage digest. The sessionID argument is ignored.
+func (e *CodexEngine) SessionUsage(sessionID string) *models.UsageStats {
+ return nil
+}
+
+// trackWorkspace records path as a workspace owned by this engine so that
+// Shutdown can process it later. The list is updated under workspacesMu.
+func (e *CodexEngine) trackWorkspace(path string) {
+	e.workspacesMu.Lock()
+	e.workspaces = append(e.workspaces, path)
+	e.workspacesMu.Unlock()
+}
+
+// buildPrompt assembles the text sent to Codex on stdin.
+//
+// The prompt consists of, in order: the skill system message (skipped when
+// req.NoSkills is set or the message is empty), a task section with the task
+// name, description, and context metadata (skipped when all three are empty),
+// and finally req.Message. Metadata entries are written in sorted key order so
+// that identical requests always produce identical prompts.
+func (e *CodexEngine) buildPrompt(sourceDir string, req *ExecutionRequest) string {
+	var sb strings.Builder
+
+	if !req.NoSkills {
+		skillDirs := skillDirsForRequest(sourceDir, req)
+		if msg := buildSkillSystemMessage(skillDirs, req.SkillName); msg != "" {
+			sb.WriteString(msg)
+			sb.WriteString("\n")
+		}
+	}
+
+	if req.TaskName != "" || req.TaskDescription != "" || len(req.Context) > 0 {
+		sb.WriteString("\n")
+		if req.TaskName != "" {
+			fmt.Fprintf(&sb, "Name: %s\n", req.TaskName)
+		}
+		if req.TaskDescription != "" {
+			fmt.Fprintf(&sb, "Description: %s\n", req.TaskDescription)
+		}
+		if len(req.Context) > 0 {
+			sb.WriteString("Metadata:\n")
+			// Map iteration order is random; sort the keys for a stable prompt.
+			keys := make([]string, 0, len(req.Context))
+			for k := range req.Context {
+				keys = append(keys, k)
+			}
+			sort.Strings(keys)
+			for _, k := range keys {
+				fmt.Fprintf(&sb, "- %s: %v\n", k, req.Context[k])
+			}
+		}
+		sb.WriteString("\n\n")
+	}
+
+	sb.WriteString(req.Message)
+	return sb.String()
+}
+
+// readCodexOutput returns the contents of the file at outputPath. When the file
+// cannot be read or is empty it returns the stdout argument instead, which the
+// caller fills with the last assistant message found in the event stream.
+func readCodexOutput(outputPath, stdout string) string {
+	contents, readErr := os.ReadFile(outputPath)
+	if readErr != nil || len(contents) == 0 {
+		return stdout
+	}
+	return string(contents)
+}
+
+// codexTelemetry is what parseCodexJSONEvents extracts from the JSON event
+// stream of one Codex run.
+type codexTelemetry struct {
+ // SessionID is the thread ID from the "thread.started" event, if any.
+ SessionID string
+ // Events holds assistant messages and tool start/complete events in the
+ // order they appeared in the stream.
+ Events []copilot.SessionEvent
+ // Usage is the token usage from a "turn.completed" event; it stays nil when
+ // no such event reported non-zero usage.
+ Usage *models.UsageStats
+}
+
+// FinalOutput returns the content of the most recent assistant message in the
+// recorded events, or an empty string when there is none.
+func (t codexTelemetry) FinalOutput() string {
+	idx := len(t.Events)
+	for idx > 0 {
+		idx--
+		candidate := &t.Events[idx]
+		if candidate.Type != copilot.AssistantMessage {
+			continue
+		}
+		if content := candidate.Data.Content; content != nil {
+			return *content
+		}
+	}
+	return ""
+}
+
+// codexJSONEvent is one line of the event stream that the Codex CLI prints on
+// stdout in --json mode. Only the fields read by parseCodexJSONEvents are
+// declared.
+type codexJSONEvent struct {
+ Type string `json:"type"`
+ ThreadID string `json:"thread_id"`
+ Item codexJSONItem `json:"item"`
+ Usage codexJSONUsage `json:"usage"`
+}
+
+// codexJSONItem is the "item" payload of an item.started or item.completed
+// event. Which fields are populated depends on the item type:
+// codexToolFields reads Command and AggregatedOutput for "command_execution",
+// Changes and Status for "file_change", and Name/ToolName, Arguments, and
+// Output for item types containing "tool". Text carries an "agent_message".
+type codexJSONItem struct {
+ ID string `json:"id"`
+ Type string `json:"type"`
+ Text string `json:"text"`
+ Name string `json:"name"`
+ ToolName string `json:"tool_name"`
+ Command string `json:"command"`
+ AggregatedOutput string `json:"aggregated_output"`
+ Output string `json:"output"`
+ Status string `json:"status"`
+ // ExitCode is a pointer so a missing or null exit code stays distinct
+ // from an exit code of zero.
+ ExitCode *int `json:"exit_code"`
+ Arguments any `json:"arguments"`
+ Changes []codexJSONFileChange `json:"changes"`
+ // Extra is excluded from JSON decoding and is not populated in this file.
+ Extra map[string]interface{} `json:"-"`
+}
+
+// codexJSONFileChange is one entry in the "changes" list of a file_change
+// item: the affected path and the kind of change reported for it.
+type codexJSONFileChange struct {
+ Path string `json:"path"`
+ Kind string `json:"kind"`
+}
+
+// codexJSONUsage is the "usage" payload of a turn.completed event.
+// parseCodexJSONEvents copies the input, cached-input, and output counts;
+// ReasoningOutputTokens is decoded but not used there.
+type codexJSONUsage struct {
+ InputTokens int `json:"input_tokens"`
+ CachedInputTokens int `json:"cached_input_tokens"`
+ OutputTokens int `json:"output_tokens"`
+ ReasoningOutputTokens int `json:"reasoning_output_tokens"`
+}
+
+// parseCodexJSONEvents converts Codex's line-delimited JSON output into
+// telemetry.
+//
+// Lines that do not start with "{" or that fail to decode are skipped.
+// "thread.started" supplies the session ID. "item.started" and
+// "item.completed" become tool start and tool complete events for recognized
+// tool items. A completed "agent_message" with non-empty text becomes an
+// assistant message. "turn.completed" supplies token usage, which replaces any
+// usage seen earlier unless the new value is all zero.
+func parseCodexJSONEvents(stdout string) codexTelemetry {
+	var result codexTelemetry
+
+	for _, raw := range strings.Split(stdout, "\n") {
+		trimmed := strings.TrimSpace(raw)
+		// An empty line has no "{" prefix either, so one test covers both.
+		if !strings.HasPrefix(trimmed, "{") {
+			continue
+		}
+
+		var parsed codexJSONEvent
+		if json.Unmarshal([]byte(trimmed), &parsed) != nil {
+			continue
+		}
+
+		switch parsed.Type {
+		case "thread.started":
+			result.SessionID = parsed.ThreadID
+
+		case "item.started", "item.completed":
+			done := parsed.Type == "item.completed"
+			if done && parsed.Item.Type == "agent_message" {
+				if body := parsed.Item.Text; body != "" {
+					result.Events = append(result.Events, copilot.SessionEvent{
+						Type: copilot.AssistantMessage,
+						Data: copilot.Data{Content: &body},
+					})
+				}
+				continue
+			}
+			if toolEvent, ok := codexItemToSessionEvent(parsed.Item, done); ok {
+				result.Events = append(result.Events, toolEvent)
+			}
+
+		case "turn.completed":
+			stats := &models.UsageStats{
+				Turns:           1,
+				InputTokens:     parsed.Usage.InputTokens,
+				OutputTokens:    parsed.Usage.OutputTokens,
+				CacheReadTokens: parsed.Usage.CachedInputTokens,
+			}
+			if stats.IsZero() {
+				continue
+			}
+			result.Usage = stats
+		}
+	}
+
+	return result
+}
+
+// codexItemToSessionEvent maps a Codex item onto a tool execution event.
+//
+// It returns false for items that codexToolFields does not recognize as tool
+// activity. With completed set to false the result is a tool start event
+// carrying the arguments. With completed set to true the result is a tool
+// complete event carrying the result text and a success flag, which is false
+// when the status is "failed" or the exit code is present and non-zero. An item
+// without an ID gets a call ID derived from its type and tool name.
+func codexItemToSessionEvent(item codexJSONItem, completed bool) (copilot.SessionEvent, bool) {
+	name, arguments, output, recognized := codexToolFields(item)
+	if !recognized {
+		return copilot.SessionEvent{}, false
+	}
+
+	callID := item.ID
+	if callID == "" {
+		callID = fmt.Sprintf("%s-%s", item.Type, name)
+	}
+
+	if completed {
+		succeeded := item.Status != "failed" && (item.ExitCode == nil || *item.ExitCode == 0)
+		done := copilot.SessionEvent{
+			Type: copilot.ToolExecutionComplete,
+			Data: copilot.Data{
+				ToolCallID: &callID,
+				ToolName:   &name,
+				Success:    &succeeded,
+				Result: &copilot.Result{
+					Content: &output,
+				},
+			},
+		}
+		return done, true
+	}
+
+	started := copilot.SessionEvent{
+		Type: copilot.ToolExecutionStart,
+		Data: copilot.Data{
+			ToolCallID: &callID,
+			ToolName:   &name,
+			Arguments:  arguments,
+		},
+	}
+	return started, true
+}
+
+func codexToolFields(item codexJSONItem) (string, any, string, bool) {
+ switch item.Type {
+ case "command_execution":
+ return "bash", map[string]any{"command": item.Command}, item.AggregatedOutput, true
+ case "file_change":
+ path := ""
+ kind := ""
+ if len(item.Changes) > 0 {
+ path = item.Changes[0].Path
+ kind = item.Changes[0].Kind
+ }
+ return "edit", map[string]any{"path": path, "command": kind}, item.Status, true
+ }
+
+ if strings.Contains(item.Type, "tool") {
+ name := item.Name
+ if name == "" {
+ name = item.ToolName
+ }
+ if name == "" {
+ name = item.Type
+ }
+ result := item.Output
+ if result == "" {
+ result = item.AggregatedOutput
+ }
+ if result == "" {
+ result = item.Status
+ }
+ return name, item.Arguments, result, true
+ }
+
+ return "", nil, "", false
+}
+
+// skillDirsForRequest returns the directories to search for skills: cwd first,
+// then each entry of req.SkillPaths in order, skipping exact duplicates. The
+// list is passed through cleanSkillDirs, so duplicates are detected on the raw
+// strings before the paths are cleaned.
+func skillDirsForRequest(cwd string, req *ExecutionRequest) []string {
+	known := make(map[string]struct{}, len(req.SkillPaths)+1)
+	known[cwd] = struct{}{}
+
+	ordered := append(make([]string, 0, len(req.SkillPaths)+1), cwd)
+	for _, candidate := range req.SkillPaths {
+		if _, dup := known[candidate]; dup {
+			continue
+		}
+		known[candidate] = struct{}{}
+		ordered = append(ordered, candidate)
+	}
+
+	return cleanSkillDirs(ordered)
+}
+
+// cleanSkillDirs returns paths with empty entries dropped and every remaining
+// entry normalized by filepath.Clean. The result is always a non-nil slice.
+func cleanSkillDirs(paths []string) []string {
+	out := make([]string, 0, len(paths))
+	for i := range paths {
+		if p := paths[i]; p != "" {
+			out = append(out, filepath.Clean(p))
+		}
+	}
+	return out
+}
diff --git a/internal/execution/codex_test.go b/internal/execution/codex_test.go
new file mode 100644
index 000000000..52f862b5e
--- /dev/null
+++ b/internal/execution/codex_test.go
@@ -0,0 +1,195 @@
+package execution
+
+import (
+ "context"
+ "os"
+ "path/filepath"
+ "runtime"
+ "strconv"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+)
+
+// TestCodexEngineExecuteUsesCLIWorkspaceAndSkillContext runs one request
+// through a fake Codex binary and checks that resources land in the workspace,
+// the prompt carries the skill and task text, the expected CLI flags are
+// passed, and the JSON event stream is turned into tool calls, a session ID,
+// and usage.
+func TestCodexEngineExecuteUsesCLIWorkspaceAndSkillContext(t *testing.T) {
+ if runtime.GOOS == "windows" {
+ t.Skip("fake codex shell script is POSIX-only")
+ }
+
+ fakeCodex := writeFakeCodex(t, 0)
+ // A skill the prompt builder is expected to pick up from sourceDir/skills.
+ sourceDir := t.TempDir()
+ skillDir := filepath.Join(sourceDir, "skills", "demo")
+ require.NoError(t, os.MkdirAll(skillDir, 0o755))
+ require.NoError(t, os.WriteFile(filepath.Join(skillDir, "SKILL.md"), []byte("---\nname: demo\n---\nAlways mention workspace facts."), 0o644))
+
+ engine := NewCodexEngine("test-model", WithCodexBinary(fakeCodex))
+ require.NoError(t, engine.Initialize(context.Background()))
+ defer func() {
+ require.NoError(t, engine.Shutdown(context.Background()))
+ }()
+
+ resp, err := engine.Execute(context.Background(), &ExecutionRequest{
+ Message: "Inspect the fixture.",
+ ModelReasoningEffort: "high",
+ Resources: []ResourceFile{{Path: "input.txt", Content: []byte("fixture data")}},
+ SourceDir: sourceDir,
+ SkillName: "demo",
+ TaskName: "Codex task",
+ TaskDescription: "Verify fake execution.",
+ SkillPaths: []string{filepath.Join(sourceDir, "skills")},
+ Timeout: 10 * time.Second,
+ })
+
+ require.NoError(t, err)
+ require.True(t, resp.Success)
+ require.Equal(t, "final from fake codex", resp.FinalOutput)
+ require.Equal(t, "test-model", resp.ModelID)
+ require.Contains(t, resp.WorkspaceFiles, "created.txt")
+ require.Equal(t, []byte("fixture data"), resp.WorkspaceFiles["input.txt"])
+
+ // The fake binary saves stdin as prompt.txt inside the workspace.
+ prompt := string(resp.WorkspaceFiles["prompt.txt"])
+ // NOTE(review): Contains with an empty substring always passes; confirm
+ // which marker this assertion was meant to look for.
+ require.Contains(t, prompt, "")
+ require.Contains(t, prompt, "Always mention workspace facts.")
+ require.Contains(t, prompt, "Name: Codex task")
+
+ // The fake binary saves its arguments as args.txt inside the workspace.
+ args := string(resp.WorkspaceFiles["args.txt"])
+ require.Contains(t, args, "--model test-model")
+ require.Contains(t, args, `approval_policy="never"`)
+ require.Contains(t, args, `model_reasoning_effort="high"`)
+ require.Contains(t, args, "--sandbox workspace-write")
+ require.NotContains(t, args, "--ephemeral")
+ // Telemetry values below come from the fixed event stream of the fake binary.
+ require.Len(t, resp.ToolCalls, 1)
+ require.Equal(t, "bash", resp.ToolCalls[0].Name)
+ require.Equal(t, "codex-test-session", resp.SessionID)
+ require.NotNil(t, resp.Usage)
+ require.Equal(t, 12, resp.Usage.InputTokens)
+}
+
+// TestCodexEngineExecuteReportsCLIError checks that a non-zero Codex exit is
+// reported through the response (Success false, stderr text in ErrorMsg) and
+// not as a Go error, and that the final output is still captured.
+func TestCodexEngineExecuteReportsCLIError(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+
+	ctx := context.Background()
+	engine := NewCodexEngine("", WithCodexBinary(writeFakeCodex(t, 7)))
+	require.NoError(t, engine.Initialize(ctx))
+	defer func() {
+		require.NoError(t, engine.Shutdown(ctx))
+	}()
+
+	failing := &ExecutionRequest{Message: "fail", Timeout: 10 * time.Second}
+	resp, err := engine.Execute(ctx, failing)
+
+	require.NoError(t, err)
+	require.False(t, resp.Success)
+	require.Contains(t, resp.ErrorMsg, "fake codex failed")
+	require.Equal(t, "final from fake codex", resp.FinalOutput)
+}
+
+// TestCodexEngineExecuteResumesSessionForFollowUp checks that a follow-up
+// request carrying the first response's session ID and workspace is run as
+// `exec resume` for that session and stays in the same workspace.
+func TestCodexEngineExecuteResumesSessionForFollowUp(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+
+	ctx := context.Background()
+	engine := NewCodexEngine("", WithCodexBinary(writeFakeCodex(t, 0)))
+	require.NoError(t, engine.Initialize(ctx))
+	defer func() {
+		require.NoError(t, engine.Shutdown(ctx))
+	}()
+
+	opening := &ExecutionRequest{
+		Message: "Remember apple.",
+		Timeout: 10 * time.Second,
+	}
+	first, err := engine.Execute(ctx, opening)
+	require.NoError(t, err)
+	require.Equal(t, "codex-test-session", first.SessionID)
+
+	// The follow-up reuses both the session and the workspace of the first run.
+	followUp := &ExecutionRequest{
+		Message:      "What did I ask you to remember?",
+		SessionID:    first.SessionID,
+		WorkspaceDir: first.WorkspaceDir,
+		Timeout:      10 * time.Second,
+	}
+	second, err := engine.Execute(ctx, followUp)
+	require.NoError(t, err)
+
+	recorded := string(second.WorkspaceFiles["args.txt"])
+	require.Contains(t, recorded, "exec resume")
+	require.Contains(t, recorded, "codex-test-session")
+	require.NotContains(t, recorded, "--ephemeral")
+	require.Equal(t, first.WorkspaceDir, second.WorkspaceDir)
+}
+
+// TestCodexEngineExecuteRejectsSkillTriggerTelemetry checks that Execute
+// refuses a request with CancelOnSkillInvocation set, since the codex engine
+// cannot observe skill invocations.
+func TestCodexEngineExecuteRejectsSkillTriggerTelemetry(t *testing.T) {
+	// Initialize must resolve the fake binary before Execute is reached. That
+	// binary is a POSIX shell script, so skip on Windows like the other tests
+	// in this file that use it.
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+
+	fakeCodex := writeFakeCodex(t, 0)
+	engine := NewCodexEngine("", WithCodexBinary(fakeCodex))
+	require.NoError(t, engine.Initialize(context.Background()))
+	defer func() {
+		require.NoError(t, engine.Shutdown(context.Background()))
+	}()
+
+	_, err := engine.Execute(context.Background(), &ExecutionRequest{
+		Message:                 "trigger?",
+		Timeout:                 10 * time.Second,
+		CancelOnSkillInvocation: true,
+	})
+	require.ErrorContains(t, err, "does not support skill invocation telemetry")
+}
+
+// writeFakeCodex writes an executable POSIX shell script named "codex" into a
+// fresh temp directory and returns its path.
+//
+// The script changes into the --cd directory when one is given, saves stdin as
+// prompt.txt and its arguments as args.txt, creates created.txt, writes a fixed
+// final message to the --output-last-message file (or to stdout when no such
+// flag is given), and prints a fixed JSON event stream. When exitCode is
+// non-zero it also prints "fake codex failed" to stderr and exits with that
+// code.
+func writeFakeCodex(t *testing.T, exitCode int) string {
+ t.Helper()
+
+ dir := t.TempDir()
+ path := filepath.Join(dir, "codex")
+ // exitCode is spliced into the script text at the two marked positions.
+ script := `#!/bin/sh
+set -u
+work=""
+out=""
+args=""
+while [ "$#" -gt 0 ]; do
+ case "$1" in
+ --cd)
+ work="$2"
+ args="$args $1 $2"
+ shift 2
+ ;;
+ --output-last-message)
+ out="$2"
+ args="$args $1 $2"
+ shift 2
+ ;;
+ *)
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+if [ -n "$work" ]; then
+ cd "$work"
+fi
+cat > prompt.txt
+printf "%s" "$args" > args.txt
+printf "created by fake codex" > created.txt
+if [ -n "$out" ]; then
+ printf "final from fake codex" > "$out"
+else
+ printf "final from fake codex"
+fi
+cat <<'JSON'
+{"type":"thread.started","thread_id":"codex-test-session"}
+{"type":"item.started","item":{"id":"item_0","type":"command_execution","command":"/bin/sh -c pwd","aggregated_output":"","exit_code":null,"status":"in_progress"}}
+{"type":"item.completed","item":{"id":"item_0","type":"command_execution","command":"/bin/sh -c pwd","aggregated_output":"fake pwd\n","exit_code":0,"status":"completed"}}
+{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"final from fake codex"}}
+{"type":"turn.completed","usage":{"input_tokens":12,"cached_input_tokens":3,"output_tokens":4,"reasoning_output_tokens":1}}
+JSON
+if [ ` + strconv.Itoa(exitCode) + ` -ne 0 ]; then
+ printf "fake codex failed\n" >&2
+ exit ` + strconv.Itoa(exitCode) + `
+fi
+`
+ require.NoError(t, os.WriteFile(path, []byte(script), 0o755))
+ return path
+}
diff --git a/internal/execution/engine.go b/internal/execution/engine.go
index acbd870e5..907e7d54a 100644
--- a/internal/execution/engine.go
+++ b/internal/execution/engine.go
@@ -36,10 +36,11 @@ type WorkspaceKeeper interface {
// ExecutionRequest represents a test execution request
type ExecutionRequest struct {
- ModelID string
- Message string
- Context map[string]any
- Resources []ResourceFile
+ ModelID string
+ ModelReasoningEffort string
+ Message string
+ Context map[string]any
+ Resources []ResourceFile
SessionID string
WorkspaceDir string // Reuse an existing workspace directory (for follow-up prompts)
diff --git a/internal/models/spec.go b/internal/models/spec.go
index 6f8c33545..de423bfb7 100644
--- a/internal/models/spec.go
+++ b/internal/models/spec.go
@@ -35,20 +35,21 @@ type SpecIdentity struct {
// Config controls execution behavior
type Config struct {
- TrialsPerTask int `yaml:"trials_per_task" json:"runs_per_test"`
- TimeoutSec int `yaml:"timeout_seconds" json:"timeout_sec"`
- Concurrent bool `yaml:"parallel" json:"concurrent"`
- Workers int `yaml:"workers,omitempty" json:"workers,omitempty"`
- StopOnError bool `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
- EngineType string `yaml:"executor" json:"engine_type"`
- ModelID string `yaml:"model" json:"model_id"`
- SkillPaths []string `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
- DisabledSkills []string `yaml:"disabled_skills,omitempty" json:"disabled_skills,omitempty"`
- RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
- ServerConfigs map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
- MaxAttempts int `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
- GroupBy string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
- JudgeModel string `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
+ TrialsPerTask int `yaml:"trials_per_task" json:"runs_per_test"`
+ TimeoutSec int `yaml:"timeout_seconds" json:"timeout_sec"`
+ Concurrent bool `yaml:"parallel" json:"concurrent"`
+ Workers int `yaml:"workers,omitempty" json:"workers,omitempty"`
+ StopOnError bool `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
+ EngineType string `yaml:"executor" json:"engine_type"`
+ ModelID string `yaml:"model" json:"model_id"`
+ ModelReasoningEffort string `yaml:"model_reasoning_effort,omitempty" json:"model_reasoning_effort,omitempty"`
+ SkillPaths []string `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
+ DisabledSkills []string `yaml:"disabled_skills,omitempty" json:"disabled_skills,omitempty"`
+ RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
+ ServerConfigs map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
+ MaxAttempts int `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
+ GroupBy string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
+ JudgeModel string `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
}
// GraderConfig defines a validator/grader
diff --git a/internal/models/spec_test.go b/internal/models/spec_test.go
index db0a437ba..25ce0e0f6 100644
--- a/internal/models/spec_test.go
+++ b/internal/models/spec_test.go
@@ -364,6 +364,32 @@ config:
})
}
+func TestEvalSpec_ModelReasoningEffort(t *testing.T) {
+ tempDir := t.TempDir()
+ yamlContent := `name: reasoning-test
+skill: test
+config:
+ trials_per_task: 1
+ timeout_seconds: 60
+ executor: codex
+ model: gpt-4o
+ model_reasoning_effort: high
+`
+ specPath := filepath.Join(tempDir, "reasoning.yaml")
+ if err := os.WriteFile(specPath, []byte(yamlContent), 0644); err != nil {
+ t.Fatalf("Failed to write spec file: %v", err)
+ }
+
+ spec, err := LoadEvalSpec(specPath)
+ if err != nil {
+ t.Fatalf("Failed to load spec: %v", err)
+ }
+
+ if spec.Config.ModelReasoningEffort != "high" {
+ t.Errorf("Expected model_reasoning_effort='high', got '%s'", spec.Config.ModelReasoningEffort)
+ }
+}
+
func TestConfig_AllSkillsDisabled(t *testing.T) {
tests := []struct {
name string
diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go
index f9eaf731a..6d3ecb159 100644
--- a/internal/orchestration/runner.go
+++ b/internal/orchestration/runner.go
@@ -1158,16 +1158,18 @@ func (r *EvalRunner) buildExecutionRequest(tc *models.TestCase) *execution.Execu
noSkills := spec.Config.AllSkillsDisabled()
return &execution.ExecutionRequest{
- Message: tc.Stimulus.Message,
- Context: tc.Stimulus.Metadata,
- Resources: resources,
- SkillName: spec.SkillName,
- TaskName: tc.DisplayName,
- TaskDescription: tc.Summary,
- SkillPaths: resolvedSkillPaths,
- NoSkills: noSkills,
- Timeout: time.Duration(timeout) * time.Second,
- MCPServers: convertMCPServers(spec.Config.ServerConfigs),
+ ModelID: spec.Config.ModelID,
+ ModelReasoningEffort: spec.Config.ModelReasoningEffort,
+ Message: tc.Stimulus.Message,
+ Context: tc.Stimulus.Metadata,
+ Resources: resources,
+ SkillName: spec.SkillName,
+ TaskName: tc.DisplayName,
+ TaskDescription: tc.Summary,
+ SkillPaths: resolvedSkillPaths,
+ NoSkills: noSkills,
+ Timeout: time.Duration(timeout) * time.Second,
+ MCPServers: convertMCPServers(spec.Config.ServerConfigs),
}
}
diff --git a/internal/projectconfig/config.go b/internal/projectconfig/config.go
index cac235e6e..7ec5fb579 100644
--- a/internal/projectconfig/config.go
+++ b/internal/projectconfig/config.go
@@ -56,14 +56,15 @@ type PathsConfig struct {
// DefaultsConfig holds default execution parameters.
type DefaultsConfig struct {
- Engine string `yaml:"engine,omitempty"`
- Model string `yaml:"model,omitempty"`
- JudgeModel string `yaml:"judgeModel,omitempty"`
- Timeout int `yaml:"timeout,omitempty"`
- Parallel *bool `yaml:"parallel,omitempty"`
- Workers int `yaml:"workers,omitempty"`
- Verbose *bool `yaml:"verbose,omitempty"`
- SessionLog *bool `yaml:"sessionLog,omitempty"`
+ Engine string `yaml:"engine,omitempty"`
+ Model string `yaml:"model,omitempty"`
+ ModelReasoningEffort string `yaml:"model_reasoning_effort,omitempty"`
+ JudgeModel string `yaml:"judgeModel,omitempty"`
+ Timeout int `yaml:"timeout,omitempty"`
+ Parallel *bool `yaml:"parallel,omitempty"`
+ Workers int `yaml:"workers,omitempty"`
+ Verbose *bool `yaml:"verbose,omitempty"`
+ SessionLog *bool `yaml:"sessionLog,omitempty"`
}
// CacheConfig holds cache settings.
@@ -146,14 +147,15 @@ func New() *ProjectConfig {
Results: DefaultResultsDir,
},
Defaults: DefaultsConfig{
- Engine: DefaultEngine,
- Model: DefaultModel,
- JudgeModel: "",
- Timeout: DefaultTimeout,
- Parallel: boolPtr(false),
- Workers: DefaultWorkers,
- Verbose: boolPtr(false),
- SessionLog: boolPtr(false),
+ Engine: DefaultEngine,
+ Model: DefaultModel,
+ ModelReasoningEffort: "",
+ JudgeModel: "",
+ Timeout: DefaultTimeout,
+ Parallel: boolPtr(false),
+ Workers: DefaultWorkers,
+ Verbose: boolPtr(false),
+ SessionLog: boolPtr(false),
},
Cache: CacheConfig{
Enabled: boolPtr(false),
@@ -206,9 +208,17 @@ func Load(startDir string) (*ProjectConfig, error) {
if err := decoder.Decode(&fileCfg); err != nil {
return nil, fmt.Errorf("parsing .waza.yaml: %w", err)
}
+ defaultsModelSet := hasDefaultsField(data, "model")
// Merge file values onto defaults.
mergeConfig(cfg, &fileCfg)
+ if defaultsModelSet {
+ // An explicit empty model is meaningful for engines such as codex, where
+ // the underlying tool can read its default model from its own config.
+ cfg.Defaults.Model = fileCfg.Defaults.Model
+ } else if fileCfg.Defaults.Engine == "codex" {
+ cfg.Defaults.Model = ""
+ }
return cfg, nil
}
@@ -262,6 +272,9 @@ func mergeConfig(dst, src *ProjectConfig) {
if src.Defaults.Model != "" {
dst.Defaults.Model = src.Defaults.Model
}
+ if src.Defaults.ModelReasoningEffort != "" {
+ dst.Defaults.ModelReasoningEffort = src.Defaults.ModelReasoningEffort
+ }
if src.Defaults.JudgeModel != "" {
dst.Defaults.JudgeModel = src.Defaults.JudgeModel
}
@@ -340,3 +353,26 @@ func mergeConfig(dst, src *ProjectConfig) {
func boolPtr(b bool) *bool {
return &b
}
+
+// hasDefaultsField reports whether the raw YAML document in data contains the
+// key named field directly inside a top-level "defaults" mapping. It inspects
+// the node tree rather than a decoded struct, so a key that is present with an
+// empty value still counts as set. Unparseable input, an empty document, or a
+// non-mapping root all yield false.
+func hasDefaultsField(data []byte, field string) bool {
+	var doc yaml.Node
+	if yaml.Unmarshal(data, &doc) != nil || len(doc.Content) == 0 {
+		return false
+	}
+
+	root := doc.Content[0]
+	if root.Kind != yaml.MappingNode {
+		return false
+	}
+
+	// Content is walked two entries at a time: key at k, its value at k+1.
+	for k := 0; k+1 < len(root.Content); k += 2 {
+		key, val := root.Content[k], root.Content[k+1]
+		if key.Value != "defaults" || val.Kind != yaml.MappingNode {
+			continue
+		}
+		for f := 0; f+1 < len(val.Content); f += 2 {
+			if val.Content[f].Value == field {
+				return true
+			}
+		}
+	}
+	return false
+}
diff --git a/internal/projectconfig/config_test.go b/internal/projectconfig/config_test.go
index 47b497990..04b19d4bc 100644
--- a/internal/projectconfig/config_test.go
+++ b/internal/projectconfig/config_test.go
@@ -17,6 +17,7 @@ func TestNew_ReturnsAllDefaults(t *testing.T) {
// Defaults
assertEqual(t, "Defaults.Engine", "copilot-sdk", cfg.Defaults.Engine)
assertEqual(t, "Defaults.Model", "claude-sonnet-4.6", cfg.Defaults.Model)
+ assertEqual(t, "Defaults.ModelReasoningEffort", "", cfg.Defaults.ModelReasoningEffort)
assertEqual(t, "Defaults.JudgeModel", "", cfg.Defaults.JudgeModel)
assertEqualInt(t, "Defaults.Timeout", 300, cfg.Defaults.Timeout)
assertBoolPtr(t, "Defaults.Parallel", false, cfg.Defaults.Parallel)
@@ -58,6 +59,7 @@ paths:
defaults:
engine: mock
model: gpt-4o
+ model_reasoning_effort: high
judgeModel: claude-sonnet-4.6
timeout: 600
parallel: true
@@ -96,6 +98,7 @@ graders:
assertEqual(t, "Paths.Results", "custom-results/", cfg.Paths.Results)
assertEqual(t, "Defaults.Engine", "mock", cfg.Defaults.Engine)
assertEqual(t, "Defaults.Model", "gpt-4o", cfg.Defaults.Model)
+ assertEqual(t, "Defaults.ModelReasoningEffort", "high", cfg.Defaults.ModelReasoningEffort)
assertEqual(t, "Defaults.JudgeModel", "claude-sonnet-4.6", cfg.Defaults.JudgeModel)
assertEqualInt(t, "Defaults.Timeout", 600, cfg.Defaults.Timeout)
assertBoolPtr(t, "Defaults.Parallel", true, cfg.Defaults.Parallel)
@@ -148,6 +151,39 @@ defaults:
assertEqualInt(t, "Graders.ProgramTimeout", 30, cfg.Graders.ProgramTimeout)
}
+func TestLoad_CodexDefaultsAllowConfigModel(t *testing.T) {
+ dir := t.TempDir()
+ writeFile(t, dir, ".waza.yaml", `
+defaults:
+ engine: codex
+`)
+
+ cfg, err := Load(dir)
+ if err != nil {
+ t.Fatalf("Load() error: %v", err)
+ }
+
+ assertEqual(t, "Defaults.Engine", "codex", cfg.Defaults.Engine)
+ assertEqual(t, "Defaults.Model", "", cfg.Defaults.Model)
+}
+
+func TestLoad_ExplicitEmptyModelOverridesDefault(t *testing.T) {
+ dir := t.TempDir()
+ writeFile(t, dir, ".waza.yaml", `
+defaults:
+ engine: codex
+ model: ""
+`)
+
+ cfg, err := Load(dir)
+ if err != nil {
+ t.Fatalf("Load() error: %v", err)
+ }
+
+ assertEqual(t, "Defaults.Engine", "codex", cfg.Defaults.Engine)
+ assertEqual(t, "Defaults.Model", "", cfg.Defaults.Model)
+}
+
func TestLoad_MissingFile_ReturnsDefaults(t *testing.T) {
dir := t.TempDir()
diff --git a/internal/projectconfig/schema_parity_test.go b/internal/projectconfig/schema_parity_test.go
index 7487df360..69961ec5b 100644
--- a/internal/projectconfig/schema_parity_test.go
+++ b/internal/projectconfig/schema_parity_test.go
@@ -65,6 +65,7 @@ func TestSchemaDefaultsMatchGoDefaults(t *testing.T) {
// --- defaults ---
assertStringDefault(t, getDefault("defaults", "engine"), cfg.Defaults.Engine, "defaults.engine")
assertStringDefault(t, getDefault("defaults", "model"), cfg.Defaults.Model, "defaults.model")
+ assertStringDefault(t, getDefault("defaults", "model_reasoning_effort"), cfg.Defaults.ModelReasoningEffort, "defaults.model_reasoning_effort")
assertIntDefault(t, getDefault("defaults", "timeout"), cfg.Defaults.Timeout, "defaults.timeout")
assertBoolDefault(t, getDefault("defaults", "parallel"), *cfg.Defaults.Parallel, "defaults.parallel")
assertIntDefault(t, getDefault("defaults", "workers"), cfg.Defaults.Workers, "defaults.workers")
diff --git a/internal/scaffold/scaffold.go b/internal/scaffold/scaffold.go
index 838c4e23a..ca2d0f5f5 100644
--- a/internal/scaffold/scaffold.go
+++ b/internal/scaffold/scaffold.go
@@ -44,7 +44,8 @@ func TitleCase(s string) string {
}
// ReadProjectDefaults reads engine and model from .waza.yaml if it exists.
-// Falls back to copilot-sdk and claude-sonnet-4.6.
+// Falls back to copilot-sdk and claude-sonnet-4.6. Codex projects may return
+// an empty model so the Codex CLI can use ~/.codex/config.toml.
func ReadProjectDefaults() (engine, model string) {
dir, err := os.Getwd()
if err != nil {
@@ -59,6 +60,11 @@ func ReadProjectDefaults() (engine, model string) {
// EvalYAML returns a default eval.yaml template for the given skill name.
func EvalYAML(name, engine, model string) string {
+ modelLine := ""
+ if model != "" {
+ modelLine = fmt.Sprintf(" model: %s\n", model)
+ }
+
return fmt.Sprintf(`name: %s-eval
description: Evaluation suite for %s.
skill: %s
@@ -68,7 +74,7 @@ config:
timeout_seconds: 300
parallel: false
executor: %s
- model: %s
+%s
metrics:
- name: task_completion
weight: 1.0
@@ -87,7 +93,7 @@ graders:
- "(?i)(explain|describe|analyze|implement)"
tasks:
- "tasks/*.yaml"
-`, name, name, name, engine, model)
+`, name, name, name, engine, modelLine)
}
// TaskFiles returns a map of task filename to content.
diff --git a/internal/scaffold/scaffold_test.go b/internal/scaffold/scaffold_test.go
index 651651c81..4b89eaea0 100644
--- a/internal/scaffold/scaffold_test.go
+++ b/internal/scaffold/scaffold_test.go
@@ -81,6 +81,13 @@ func TestEvalYAML_CustomEngine(t *testing.T) {
assert.Contains(t, content, "model: gpt-4o")
}
+func TestEvalYAML_OmitsEmptyModel(t *testing.T) {
+ content := EvalYAML("my-skill", "codex", "")
+
+ assert.Contains(t, content, "executor: codex")
+ assert.NotContains(t, content, "model:")
+}
+
func TestTaskFiles(t *testing.T) {
tasks := TaskFiles("my-skill")
diff --git a/internal/validation/schema_test.go b/internal/validation/schema_test.go
index 874b05f7e..f22cd64e9 100644
--- a/internal/validation/schema_test.go
+++ b/internal/validation/schema_test.go
@@ -56,6 +56,26 @@ func TestValidateEvalBytes_Valid(t *testing.T) {
require.Empty(t, errs, "valid eval should have no errors")
}
+func TestValidateEvalBytes_CodexModelOptional(t *testing.T) {
+ yaml := `name: test-eval
+description: Test evaluation
+skill: test-skill
+version: "1.0"
+config:
+ trials_per_task: 1
+ timeout_seconds: 60
+ executor: codex
+metrics:
+ - name: accuracy
+ weight: 1.0
+ threshold: 0.8
+tasks:
+ - "tasks/*.yaml"
+`
+ errs := ValidateEvalBytes([]byte(yaml))
+ require.Empty(t, errs, "codex eval should allow model to come from ~/.codex/config.toml")
+}
+
func TestValidateEvalBytes_Invalid(t *testing.T) {
errs := ValidateEvalBytes([]byte(invalidEvalYAML))
require.NotEmpty(t, errs, "invalid eval should have errors")
diff --git a/schemas/config.schema.json b/schemas/config.schema.json
index a413eee22..45cb270e1 100644
--- a/schemas/config.schema.json
+++ b/schemas/config.schema.json
@@ -31,19 +31,25 @@
"type": "object",
"description": "Default values applied to evaluations. Used as fallbacks by 'waza run' and as defaults for 'waza new'.",
"properties": {
- "engine": {
- "type": "string",
- "description": "Execution engine for evaluations.",
- "enum": ["copilot-sdk", "mock"],
- "default": "copilot-sdk"
- },
- "model": {
- "type": "string",
- "description": "Default model identifier for evaluations.",
- "examples": ["claude-sonnet-4.6", "gpt-4o", "claude-sonnet-4"],
- "default": "claude-sonnet-4.6"
- },
- "judgeModel": {
+ "engine": {
+ "type": "string",
+ "description": "Execution engine for evaluations. Use 'codex' to run through the local Codex CLI and its ~/.codex config/auth.",
+ "enum": ["copilot-sdk", "codex", "mock"],
+ "default": "copilot-sdk"
+ },
+ "model": {
+ "type": "string",
+ "description": "Default model identifier for evaluations. Optional for the 'codex' engine when ~/.codex/config.toml provides the model.",
+ "examples": ["claude-sonnet-4.6", "gpt-4o", "claude-sonnet-4"],
+ "default": "claude-sonnet-4.6"
+ },
+ "model_reasoning_effort": {
+ "type": "string",
+ "description": "Codex model reasoning effort passed as a Codex config override. Valid values depend on the Codex CLI/model; commonly: none, minimal, low, medium, high, xhigh.",
+ "enum": ["none", "minimal", "low", "medium", "high", "xhigh"],
+ "default": ""
+ },
+ "judgeModel": {
"type": "string",
"description": "Model used for LLM-as-judge grading. If omitted, uses the same model as the evaluation.",
"examples": ["claude-sonnet-4.6", "gpt-4o"]
diff --git a/schemas/eval.schema.json b/schemas/eval.schema.json
index b5b355242..98b675743 100644
--- a/schemas/eval.schema.json
+++ b/schemas/eval.schema.json
@@ -95,8 +95,7 @@
"required": [
"trials_per_task",
"timeout_seconds",
- "executor",
- "model"
+ "executor"
],
"additionalProperties": false,
"description": "Execution configuration for the evaluation.",
@@ -130,20 +129,33 @@
"type": "string",
"enum": [
"copilot-sdk",
+ "codex",
"mock"
],
- "description": "Execution engine to use. 'copilot-sdk' for real evaluations, 'mock' for testing."
+ "description": "Execution engine to use. 'copilot-sdk' for GitHub Copilot SDK evaluations, 'codex' for local Codex CLI evaluations, 'mock' for testing."
},
"model": {
"type": "string",
"minLength": 1,
- "description": "Default model identifier for evaluations.",
+ "description": "Default model identifier for evaluations. Optional for the 'codex' executor when ~/.codex/config.toml provides the model.",
"examples": [
"gpt-4o",
"claude-sonnet-4-20250514",
"gpt-4o-mini"
]
},
+ "model_reasoning_effort": {
+ "type": "string",
+ "enum": [
+ "none",
+ "minimal",
+ "low",
+ "medium",
+ "high",
+ "xhigh"
+ ],
+ "description": "Codex model reasoning effort passed as a Codex config override."
+ },
"max_attempts": {
"type": "integer",
"minimum": 1,
diff --git a/site/src/content/docs/guides/eval-yaml.mdx b/site/src/content/docs/guides/eval-yaml.mdx
index e9b1bfa5d..3aa6b1464 100644
--- a/site/src/content/docs/guides/eval-yaml.mdx
+++ b/site/src/content/docs/guides/eval-yaml.mdx
@@ -97,8 +97,9 @@ config:
parallel: false # Run tasks sequentially (true = concurrent)
workers: 4 # Parallel workers if parallel: true
model: claude-sonnet-4.6 # Default model (override with --model)
+ model_reasoning_effort: high # Codex-only reasoning effort override
judge_model: gpt-4o # Model for LLM-as-judge graders (optional)
- executor: mock # mock (local) or copilot-sdk (real API)
+ executor: mock # mock (local), copilot-sdk (GitHub Copilot), or codex (local Codex CLI)
```
| Field | Type | Default | Description |
@@ -107,9 +108,10 @@ config:
| `timeout_seconds` | int | 300 | Task timeout in seconds |
| `parallel` | bool | false | Run tasks concurrently |
| `workers` | int | 4 | Number of parallel workers |
-| `model` | string | _required_ | Default model for tasks (override with `--model` flag) |
+| `model` | string | _required_ (optional for `codex`) | Default model for tasks (override with `--model` flag). With the `codex` executor it may be omitted, in which case the model comes from `~/.codex/config.toml`. |
+| `model_reasoning_effort` | string | — | Codex-only reasoning effort, passed to the Codex CLI via `-c model_reasoning_effort=...`. Must be one of `none`, `minimal`, `low`, `medium`, `high`, or `xhigh`. |
| `judge_model` | string | (same as `model`) | Model for `prompt`-type graders (LLM-as-judge) |
-| `executor` | string | `copilot-sdk` | Executor: `mock` (local, echoes task metadata and file content) or `copilot-sdk` (real API) |
+| `executor` | string | `copilot-sdk` | Executor: `mock` (local, echoes task metadata and file content), `copilot-sdk` (GitHub Copilot SDK), or `codex` (local Codex CLI using `~/.codex` config/auth). |
| `max_attempts` | int | 0 | Maximum retry attempts per task on failure (0 = no retries) |
| `group_by` | string | — | Group results by a field (e.g., `tags`, `task_id`) |
| `fail_fast` | bool | false | Stop the entire run on first task failure |
diff --git a/site/src/content/docs/reference/schema.mdx b/site/src/content/docs/reference/schema.mdx
index 612d0312e..586fa6550 100644
--- a/site/src/content/docs/reference/schema.mdx
+++ b/site/src/content/docs/reference/schema.mdx
@@ -145,18 +145,31 @@ config:
**Type:** string
**Default:** (empty)
-Default LLM model. Override with `--model` flag.
+Default LLM model. Override with `--model` flag. Optional for `codex` when `~/.codex/config.toml` provides the model.
```yaml
config:
model: claude-sonnet-4.6
```
+### model_reasoning_effort
+
+**Type:** string
+**Default:** (empty)
+
+Codex-only reasoning effort, passed through to the Codex CLI as `-c model_reasoning_effort=...`. Allowed values are `none`, `minimal`, `low`, `medium`, `high`, and `xhigh`.
+
+```yaml
+config:
+ executor: codex
+ model_reasoning_effort: high
+```
+
### executor
**Type:** string
**Default:** mock
-**Options:** `mock`, `copilot-sdk`
+**Options:** `mock`, `copilot-sdk`, `codex`
Execution engine:
@@ -609,6 +622,8 @@ paths:
# Model defaults
defaults:
+ engine: codex
+ model_reasoning_effort: high
model: claude-sonnet-4.6
timeout: 300
workers: 4
diff --git a/site/src/content/docs/reference/waza-yaml.mdx b/site/src/content/docs/reference/waza-yaml.mdx
index 57f408313..896e89aaa 100644
--- a/site/src/content/docs/reference/waza-yaml.mdx
+++ b/site/src/content/docs/reference/waza-yaml.mdx
@@ -15,7 +15,9 @@ paths:
results: results/
defaults:
- model: claude-sonnet-4.6
+ engine: codex
+ model: gpt-4o
+ model_reasoning_effort: high
timeout: 300
workers: 4
```
@@ -38,8 +40,9 @@ Default execution parameters applied to all commands unless overridden by CLI fl
| Field | Type | Default | Description |
|-------|------|---------|-------------|
-| `engine` | string | `copilot-sdk` | Execution engine |
-| `model` | string | `claude-sonnet-4.6` | Default model for execution |
+| `engine` | string | `copilot-sdk` | Execution engine: `copilot-sdk`, `codex`, or `mock` |
+| `model` | string | `claude-sonnet-4.6` | Default model for execution. Optional for `codex` when `~/.codex/config.toml` provides it |
+| `model_reasoning_effort` | string | | Codex-only reasoning effort, passed via `-c model_reasoning_effort=...`. Must be one of `none`, `minimal`, `low`, `medium`, `high`, or `xhigh` |
| `judgeModel` | string | | Model for LLM-as-judge graders |
| `timeout` | int | `300` | Task timeout in seconds |
| `parallel` | bool | `false` | Enable parallel execution |