diff --git a/README.md b/README.md
index 9ef3973e7..05b0ce765 100644
--- a/README.md
+++ b/README.md
@@ -833,7 +833,7 @@ cmd/waza/              CLI entrypoint and command definitions
   tokens/              Token counting subcommand
 internal/
   config/              Configuration with functional options
-  execution/           AgentEngine interface (mock, copilot)
+  execution/           AgentEngine interface (mock, copilot, codex)
   graders/             Validator registry and built-in graders
   metrics/             Scoring metrics
   models/              Data structures (EvalSpec, TestCase, EvaluationOutcome)
@@ -857,8 +857,9 @@ config:
   max_attempts: 3          # Retry failed graders up to 3 times (default: 1, no retries)
   timeout_seconds: 300
   parallel: false
-  executor: mock          # or copilot-sdk
+  executor: mock          # or copilot-sdk, codex
   model: claude-sonnet-4-20250514
+  model_reasoning_effort: high  # codex only; none, minimal, low, medium, high, xhigh
   group_by: model          # Group results by model (or other dimension)
 
 # Custom input variables available as {{.Vars.key}} in tasks and hooks
@@ -1149,6 +1150,8 @@ jobs:
 | **Go Version** | 1.26 or higher |
 | **Executor** | Use `mock` executor for CI (no API keys needed) |
 | **GitHub Token** | Only required for `copilot-sdk` executor: set `GITHUB_TOKEN` env var |
+| **Codex Auth** | Only required for `codex` executor: uses the local Codex CLI config/auth from `~/.codex` |
+| **Codex Reasoning** | Optional `model_reasoning_effort` uses Codex's config key; common values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh` |
 | **Exit Codes** | 0=success, 1=test failure, 2=config error |
 
 #### Expected Skill Structure
diff --git a/cmd/waza/cmd_init.go b/cmd/waza/cmd_init.go
index 31ec04fda..ad236aa39 100644
--- a/cmd/waza/cmd_init.go
+++ b/cmd/waza/cmd_init.go
@@ -322,6 +322,7 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
 						Description("Choose how evals are executed").
 						Options(
 							huh.NewOption("Copilot SDK — real model execution", "copilot-sdk"),
+							huh.NewOption("Codex — use ~/.codex config/auth", "codex"),
 							huh.NewOption("Mock — fast iteration, no API calls", "mock"),
 						).
 						Value(&engine),
@@ -366,6 +367,10 @@ func initCommandE(cmd *cobra.Command, args []string, noSkill bool, flagSkillsDir
 				if err := modelForm.Run(); err != nil {
 					model = projectconfig.DefaultModel
 				}
+			} else if engine == "codex" {
+				// Let Codex read the default model from ~/.codex/config.toml unless
+				// the eval later sets config.model or the user passes --model.
+				model = ""
 			}
 
 			pathsForm := huh.NewForm(
diff --git a/cmd/waza/cmd_run.go b/cmd/waza/cmd_run.go
index c3ddd2a29..8eb8d28ed 100644
--- a/cmd/waza/cmd_run.go
+++ b/cmd/waza/cmd_run.go
@@ -451,6 +451,9 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
 	if err != nil {
 		return nil, fmt.Errorf("failed to load spec: %w", err)
 	}
+	if cfg, cfgErr := projectconfig.Load(filepath.Dir(specPath)); cfgErr == nil {
+		applyProjectDefaultsToEvalSpec(spec, cfg)
+	}
 
 	// CLI flags override spec config
 	if parallel {
@@ -572,9 +575,51 @@ func runCommandForSpec(cmd *cobra.Command, sp skillSpecPath, defaultSkills []str
 	return allResults, nil
 }
 
+// applyProjectDefaultsToEvalSpec overlays the project-level defaults from
+// .waza.yaml onto an eval spec, in place. It is a no-op when either argument
+// is nil.
+//
+// A spec value is treated as "not chosen by the user" when it is empty, or
+// when it still equals the built-in default while the project config asks for
+// something else. Only such values are replaced; explicit spec values win.
+func applyProjectDefaultsToEvalSpec(spec *models.EvalSpec, cfg *projectconfig.ProjectConfig) {
+	if spec == nil || cfg == nil {
+		return
+	}
+
+	defaultEngine := cfg.Defaults.Engine
+	if defaultEngine == "" {
+		defaultEngine = projectconfig.DefaultEngine
+	}
+
+	engineWasDefault := spec.Config.EngineType == "" ||
+		(spec.Config.EngineType == projectconfig.DefaultEngine && defaultEngine != projectconfig.DefaultEngine)
+	if engineWasDefault {
+		spec.Config.EngineType = defaultEngine
+	}
+
+	defaultModel := cfg.Defaults.Model
+	modelWasDefault := spec.Config.ModelID == "" ||
+		(spec.Config.ModelID == projectconfig.DefaultModel &&
+			(defaultModel != projectconfig.DefaultModel || engineWasDefault))
+	// An empty project model must only erase the spec's model for codex, which
+	// falls back to its own configuration. For every other engine an empty
+	// override would silently discard a model the spec did provide.
+	engineResolvesOwnModel := spec.Config.EngineType == "codex"
+	if modelWasDefault && (defaultModel != "" || engineResolvesOwnModel) {
+		spec.Config.ModelID = defaultModel
+	}
+	if spec.Config.ModelReasoningEffort == "" {
+		spec.Config.ModelReasoningEffort = cfg.Defaults.ModelReasoningEffort
+	}
+}
+
+// displayModel returns the model label printed in the run banner: the
+// configured model ID when there is one, a placeholder for codex runs that
+// leave the choice to Codex, and an empty string otherwise.
+func displayModel(cfg models.Config) string {
+	switch {
+	case cfg.ModelID != "":
+		return cfg.ModelID
+	case cfg.EngineType == "codex":
+		return "default (Codex config)"
+	default:
+		return ""
+	}
+}
+
 // runSingleModel executes a benchmark for one model and returns the outcome.
 // It prints the per-model summary and saves output for single-model runs.
 func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string, defaultSkills []string) (*models.EvaluationOutcome, error) {
+	if err := validateEngineFeatureSupport(spec); err != nil {
+		return nil, err
+	}
+
 	// Get spec directory for resolving relative paths
 	specDir := filepath.Dir(specPath)
 	if !filepath.IsAbs(specDir) {
@@ -641,6 +686,8 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 		engine = execution.NewCopilotEngineBuilder(spec.Config.ModelID, &execution.CopilotEngineBuilderOptions{
 			NewCopilotClient: newCopilotClientFn, // if nil, uses the real function, otherwise overridable for tests.
 		}).Build()
+	case "codex":
+		engine = execution.NewCodexEngine(spec.Config.ModelID)
 	default:
 		return nil, fmt.Errorf("unknown engine type: %s", spec.Config.EngineType)
 	}
@@ -735,7 +782,7 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 	fmt.Printf("Running benchmark: %s\n", spec.Name)
 	fmt.Printf("Skill: %s\n", spec.SkillName)
 	fmt.Printf("Engine: %s\n", spec.Config.EngineType)
-	fmt.Printf("Model: %s\n", spec.Config.ModelID)
+	fmt.Printf("Model: %s\n", displayModel(spec.Config))
 	if spec.Config.JudgeModel != "" {
 		fmt.Printf("Judge Model: %s\n", spec.Config.JudgeModel)
 	}
@@ -906,6 +953,18 @@ func runSingleModel(cmd *cobra.Command, spec *models.EvalSpec, specPath string,
 	return outcome, nil
 }
 
+// validateEngineFeatureSupport rejects specs that combine the codex executor
+// with a grader it cannot satisfy. It returns nil for a nil spec and for any
+// engine other than codex.
+func validateEngineFeatureSupport(spec *models.EvalSpec) error {
+	if spec == nil {
+		return nil
+	}
+	if spec.Config.EngineType != "codex" {
+		return nil
+	}
+
+	for _, g := range spec.Graders {
+		if g.Kind != models.GraderKindSkillInvocation {
+			continue
+		}
+		// The first offending grader is reported; later ones are not inspected.
+		return fmt.Errorf("grader %q uses skill_invocation, which is not supported by the codex executor because Codex CLI does not emit skill invocation telemetry", g.Identifier)
+	}
+	return nil
+}
+
 // printModelComparison renders a comparison table for multi-model runs.
 func printModelComparison(results []modelResult) {
 	slices.SortFunc(results, func(a, b modelResult) int {
diff --git a/cmd/waza/cmd_run_test.go b/cmd/waza/cmd_run_test.go
index 11c7559b0..f0ee6d87a 100644
--- a/cmd/waza/cmd_run_test.go
+++ b/cmd/waza/cmd_run_test.go
@@ -11,6 +11,7 @@ import (
 	"maps"
 	"os"
 	"path/filepath"
+	"runtime"
 	"slices"
 	"sort"
 	"strings"
@@ -290,6 +291,309 @@ func TestRunCommand_MockEngineRun(t *testing.T) {
 	assert.NoError(t, err)
 }
 
+// TestRunCommand_CodexEngineRun runs the `run` command end to end against a
+// fake `codex` executable placed first on PATH, and checks that the result file
+// records the codex engine, the model, and the final message written by the
+// fake binary.
+func TestRunCommand_CodexEngineRun(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+	resetRunGlobals()
+	defer resetRunGlobals()
+
+	// The fake binary only understands --cd and --output-last-message; every
+	// other argument is consumed and ignored. It stores stdin (the prompt) in
+	// the workspace and writes a fixed final message to the requested file.
+	fakeDir := t.TempDir()
+	fakeCodex := filepath.Join(fakeDir, "codex")
+	fakeScript := `#!/bin/sh
+work=""
+out=""
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --cd)
+      work="$2"
+      shift 2
+      ;;
+    --output-last-message)
+      out="$2"
+      shift 2
+      ;;
+    *)
+      shift
+      ;;
+  esac
+done
+if [ -n "$work" ]; then
+  cd "$work"
+fi
+cat > prompt.txt
+printf "codex command output" > generated.txt
+printf "codex final output" > "$out"
+`
+	require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755))
+	// Prepend so the fake wins over any real codex installed on the machine.
+	t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+	// NOTE(review): createTestSpec is defined elsewhere; the "test-model"
+	// assertion below presumably matches the model it writes - confirm.
+	specPath := createTestSpec(t, "codex")
+	outFile := filepath.Join(t.TempDir(), "results.json")
+
+	cmd := newRunCommand()
+	cmd.SetArgs([]string{specPath, "--output", outFile})
+	cmd.SetOut(io.Discard)
+	cmd.SetErr(io.Discard)
+
+	err := cmd.Execute()
+	require.NoError(t, err)
+
+	data, err := os.ReadFile(outFile)
+	require.NoError(t, err)
+	var result models.EvaluationOutcome
+	require.NoError(t, json.Unmarshal(data, &result))
+	assert.Equal(t, "codex", result.Setup.EngineType)
+	assert.Equal(t, "test-model", result.Setup.ModelID)
+	// Length guards keep the index expressions below from panicking.
+	require.Len(t, result.TestOutcomes, 1)
+	require.Len(t, result.TestOutcomes[0].Runs, 1)
+	assert.Equal(t, "codex final output", result.TestOutcomes[0].Runs[0].FinalOutput)
+}
+
+// TestRunCommand_WazaYamlCodexOverridesScaffoldedCopilotDefault checks that a
+// .waza.yaml selecting the codex engine overrides an eval spec that still
+// carries the scaffolded copilot-sdk executor and model, and that no --model
+// flag reaches the codex CLI when the project config names no model.
+func TestRunCommand_WazaYamlCodexOverridesScaffoldedCopilotDefault(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+	resetRunGlobals()
+	defer resetRunGlobals()
+
+	// The fake binary records every argument it receives, both inside the
+	// workspace and in the file named by WAZA_FAKE_CODEX_ARGS, so the test can
+	// inspect the command line after the workspace is gone.
+	fakeDir := t.TempDir()
+	fakeCodex := filepath.Join(fakeDir, "codex")
+	fakeScript := `#!/bin/sh
+work=""
+out=""
+args=""
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --cd)
+      work="$2"
+      args="$args $1 $2"
+      shift 2
+      ;;
+    --output-last-message)
+      out="$2"
+      args="$args $1 $2"
+      shift 2
+      ;;
+    *)
+      args="$args $1"
+      shift
+      ;;
+  esac
+done
+if [ -n "$work" ]; then
+  cd "$work"
+fi
+cat > prompt.txt
+printf "%s" "$args" > args.txt
+if [ -n "${WAZA_FAKE_CODEX_ARGS:-}" ]; then
+  printf "%s" "$args" > "$WAZA_FAKE_CODEX_ARGS"
+fi
+printf "codex final output" > "$out"
+`
+	require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755))
+	t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH"))
+	argsPath := filepath.Join(t.TempDir(), "args.txt")
+	t.Setenv("WAZA_FAKE_CODEX_ARGS", argsPath)
+
+	dir := t.TempDir()
+	require.NoError(t, os.WriteFile(filepath.Join(dir, ".waza.yaml"), []byte("defaults:\n  engine: codex\n"), 0o644))
+
+	taskDir := filepath.Join(dir, "tasks")
+	require.NoError(t, os.MkdirAll(taskDir, 0o755))
+	task := `id: default-engine-task
+name: Default Engine Task
+inputs:
+  prompt: "Hello"
+`
+	require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(task), 0o644))
+
+	spec := `name: default-engine-test
+skill: cfg-skill
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 10
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+tasks:
+  - "tasks/*.yaml"
+`
+	specPath := filepath.Join(dir, "eval.yaml")
+	require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644))
+
+	outFile := filepath.Join(t.TempDir(), "results.json")
+	cmd := newRunCommand()
+	cmd.SetArgs([]string{specPath, "--output", outFile})
+	cmd.SetOut(io.Discard)
+	cmd.SetErr(io.Discard)
+
+	err := cmd.Execute()
+	require.NoError(t, err)
+
+	data, err := os.ReadFile(outFile)
+	require.NoError(t, err)
+	var result models.EvaluationOutcome
+	require.NoError(t, json.Unmarshal(data, &result))
+	assert.Equal(t, "codex", result.Setup.EngineType)
+	assert.Equal(t, "", result.Setup.ModelID, "codex should use ~/.codex/config.toml when default model is omitted")
+	// Guard the index expressions so a regression fails the test instead of
+	// panicking with an out-of-range error.
+	require.Len(t, result.TestOutcomes, 1)
+	require.Len(t, result.TestOutcomes[0].Runs, 1)
+	assert.Equal(t, "codex final output", result.TestOutcomes[0].Runs[0].FinalOutput)
+
+	argsData, err := os.ReadFile(argsPath)
+	require.NoError(t, err)
+	assert.NotContains(t, string(argsData), "--model")
+}
+
+// TestRunCommand_WazaYamlCodexConfigModelAndReasoningEffort checks that the
+// model and model_reasoning_effort defaults from .waza.yaml replace the
+// scaffolded copilot-sdk values in the eval spec and are forwarded to the
+// codex CLI as `--model` and a `-c model_reasoning_effort=...` override.
+func TestRunCommand_WazaYamlCodexConfigModelAndReasoningEffort(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+	resetRunGlobals()
+	defer resetRunGlobals()
+
+	// The fake binary records every argument it receives, both inside the
+	// workspace and in the file named by WAZA_FAKE_CODEX_ARGS.
+	fakeDir := t.TempDir()
+	fakeCodex := filepath.Join(fakeDir, "codex")
+	fakeScript := `#!/bin/sh
+work=""
+out=""
+args=""
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --cd)
+      work="$2"
+      args="$args $1 $2"
+      shift 2
+      ;;
+    --output-last-message)
+      out="$2"
+      args="$args $1 $2"
+      shift 2
+      ;;
+    *)
+      args="$args $1"
+      shift
+      ;;
+  esac
+done
+if [ -n "$work" ]; then
+  cd "$work"
+fi
+cat > prompt.txt
+printf "%s" "$args" > args.txt
+if [ -n "${WAZA_FAKE_CODEX_ARGS:-}" ]; then
+  printf "%s" "$args" > "$WAZA_FAKE_CODEX_ARGS"
+fi
+printf "codex final output" > "$out"
+`
+	require.NoError(t, os.WriteFile(fakeCodex, []byte(fakeScript), 0o755))
+	// Prepend so the fake wins over any real codex installed on the machine.
+	t.Setenv("PATH", fakeDir+string(os.PathListSeparator)+os.Getenv("PATH"))
+	argsPath := filepath.Join(t.TempDir(), "args.txt")
+	t.Setenv("WAZA_FAKE_CODEX_ARGS", argsPath)
+
+	// Project config: engine, model and reasoning effort are all set here.
+	dir := t.TempDir()
+	require.NoError(t, os.WriteFile(filepath.Join(dir, ".waza.yaml"), []byte(`defaults:
+  engine: codex
+  model: gpt-4o
+  model_reasoning_effort: high
+`), 0o644))
+
+	taskDir := filepath.Join(dir, "tasks")
+	require.NoError(t, os.MkdirAll(taskDir, 0o755))
+	task := `id: config-model-task
+name: Config Model Task
+inputs:
+  prompt: "Hello"
+`
+	require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(task), 0o644))
+
+	// The spec keeps the copilot-sdk executor and model that the project
+	// defaults are expected to replace.
+	spec := `name: config-model-test
+skill: cfg-skill
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 10
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+tasks:
+  - "tasks/*.yaml"
+`
+	specPath := filepath.Join(dir, "eval.yaml")
+	require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644))
+
+	outFile := filepath.Join(t.TempDir(), "results.json")
+	cmd := newRunCommand()
+	cmd.SetArgs([]string{specPath, "--output", outFile})
+	cmd.SetOut(io.Discard)
+	cmd.SetErr(io.Discard)
+
+	err := cmd.Execute()
+	require.NoError(t, err)
+
+	data, err := os.ReadFile(outFile)
+	require.NoError(t, err)
+	var result models.EvaluationOutcome
+	require.NoError(t, json.Unmarshal(data, &result))
+	assert.Equal(t, "codex", result.Setup.EngineType)
+	assert.Equal(t, "gpt-4o", result.Setup.ModelID)
+
+	// The recorded command line must carry both overrides.
+	argsData, err := os.ReadFile(argsPath)
+	require.NoError(t, err)
+	args := string(argsData)
+	assert.Contains(t, args, "--model gpt-4o")
+	assert.Contains(t, args, `model_reasoning_effort="high"`)
+}
+
+// TestDisplayModelShowsCodexConfigDefault covers the two codex labels: the
+// placeholder shown when no model is configured, and the model ID itself when
+// one is.
+func TestDisplayModelShowsCodexConfigDefault(t *testing.T) {
+	cases := []struct {
+		cfg  models.Config
+		want string
+	}{
+		{cfg: models.Config{EngineType: "codex"}, want: "default (Codex config)"},
+		{cfg: models.Config{EngineType: "codex", ModelID: "gpt-4o"}, want: "gpt-4o"},
+	}
+
+	for _, tc := range cases {
+		assert.Equal(t, tc.want, displayModel(tc.cfg))
+	}
+}
+
+// TestRunCommand_CodexRejectsSkillInvocationGrader checks that running a spec
+// which pairs the codex executor with a skill_invocation grader fails with an
+// error naming both the grader type and the unsupported executor.
+func TestRunCommand_CodexRejectsSkillInvocationGrader(t *testing.T) {
+	resetRunGlobals()
+	defer resetRunGlobals()
+
+	dir := t.TempDir()
+	taskDir := filepath.Join(dir, "tasks")
+	require.NoError(t, os.MkdirAll(taskDir, 0o755))
+	require.NoError(t, os.WriteFile(filepath.Join(taskDir, "task.yaml"), []byte(`id: skill-telemetry-task
+name: Skill Telemetry Task
+inputs:
+  prompt: "Hello"
+`), 0o644))
+
+	// No fake codex binary is installed here: the run is expected to fail
+	// before any engine is started.
+	spec := `name: skill-telemetry-test
+skill: cfg-skill
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 10
+  executor: codex
+graders:
+  - type: skill_invocation
+    name: required_skill
+    config:
+      required_skills:
+        - cfg-skill
+tasks:
+  - "tasks/*.yaml"
+`
+	specPath := filepath.Join(dir, "eval.yaml")
+	require.NoError(t, os.WriteFile(specPath, []byte(spec), 0o644))
+
+	cmd := newRunCommand()
+	cmd.SetArgs([]string{specPath})
+	cmd.SetOut(io.Discard)
+	cmd.SetErr(io.Discard)
+
+	err := cmd.Execute()
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "skill_invocation")
+	assert.Contains(t, err.Error(), "not supported by the codex executor")
+}
+
 func TestRunCommand_MockEngineVerbose(t *testing.T) {
 	resetRunGlobals()
 
diff --git a/internal/execution/codex.go b/internal/execution/codex.go
new file mode 100644
index 000000000..4914d7f56
--- /dev/null
+++ b/internal/execution/codex.go
@@ -0,0 +1,513 @@
+package execution
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"slices"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	copilot "github.com/github/copilot-sdk/go"
+	"github.com/microsoft/waza/internal/models"
+)
+
+// CodexEngine executes tasks through the local Codex CLI.
+//
+// The Codex CLI owns its own configuration and authentication discovery, so this
+// engine intentionally does not parse ~/.codex/config.toml or auth.json. It
+// invokes `codex exec` in Waza's isolated workspace and lets Codex load its
+// normal config/auth state.
+type CodexEngine struct {
+	// defaultModelID is used when a request does not name a model; empty means
+	// no --model flag is passed.
+	defaultModelID string
+	// binary is the executable name or path to look up ("codex" unless
+	// overridden with WithCodexBinary).
+	binary string
+	// binaryPath is the location resolved by Initialize via exec.LookPath.
+	binaryPath string
+
+	// workspacesMu guards workspaces.
+	workspacesMu sync.Mutex
+	// workspaces lists the temporary directories created by Execute.
+	workspaces []string
+	// keepWorkspace makes Shutdown report workspaces instead of deleting them.
+	keepWorkspace bool
+
+	// initCalled is set once Initialize has succeeded; Execute refuses to run
+	// before that.
+	initCalled atomic.Bool
+}
+
+// CodexEngineOption mutates a CodexEngine while it is being constructed.
+type CodexEngineOption func(*CodexEngine)
+
+// WithCodexBinary points the engine at a specific Codex executable, for tests
+// or for installations where Codex is not on PATH. An empty path leaves the
+// engine's binary untouched.
+func WithCodexBinary(path string) CodexEngineOption {
+	if path == "" {
+		return func(*CodexEngine) {}
+	}
+	return func(e *CodexEngine) {
+		e.binary = path
+	}
+}
+
+// NewCodexEngine builds an engine that shells out to `codex`, using
+// defaultModelID for requests that do not name a model. Options are applied in
+// the order given.
+func NewCodexEngine(defaultModelID string, opts ...CodexEngineOption) *CodexEngine {
+	engine := new(CodexEngine)
+	engine.defaultModelID = defaultModelID
+	engine.binary = "codex"
+
+	for i := range opts {
+		opts[i](engine)
+	}
+	return engine
+}
+
+// SetKeepWorkspace enables or disables workspace preservation on shutdown.
+// When keep is true, Shutdown prints each workspace path to stderr instead of
+// removing the directory.
+func (e *CodexEngine) SetKeepWorkspace(keep bool) {
+	e.keepWorkspace = keep
+}
+
+// Initialize resolves the Codex executable and marks the engine ready for
+// Execute. It fails fast when ctx is already done or when the executable
+// cannot be found. Config and auth loading is left to Codex itself.
+func (e *CodexEngine) Initialize(ctx context.Context) error {
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	resolved, lookErr := exec.LookPath(e.binary)
+	if lookErr != nil {
+		return fmt.Errorf("codex executable %q not found in PATH: %w", e.binary, lookErr)
+	}
+
+	// Record the path before flipping the flag that Execute checks.
+	e.binaryPath = resolved
+	e.initCalled.Store(true)
+	return nil
+}
+
+// Execute runs a test prompt with `codex exec`.
+//
+// The request is validated first: it must be non-nil, the engine must have
+// been initialized, the timeout must be positive, and trigger-test
+// cancellation (CancelOnSkillInvocation) is rejected because this engine has
+// no skill invocation telemetry. Validation happens before any temporary
+// directory or file is created, so a rejected request leaves nothing behind.
+//
+// When the request names no workspace, a temporary one is created, tracked
+// for removal by Shutdown, and populated with the request's resources. The
+// prompt is sent on stdin and the final message is read from a temporary
+// output file, falling back to the last assistant message in the JSON event
+// stream.
+//
+// A failing Codex process is not returned as an error: the response carries
+// Success=false and ErrorMsg built from stderr and the process error. A
+// non-nil error is returned only when the request is invalid or the local
+// setup fails.
+func (e *CodexEngine) Execute(ctx context.Context, req *ExecutionRequest) (*ExecutionResponse, error) {
+	if req == nil {
+		return nil, fmt.Errorf("nil req was passed to CodexEngine.Execute")
+	}
+	if !e.initCalled.Load() {
+		return nil, fmt.Errorf("engine was not initialized. Initialize needs to be called before Execute")
+	}
+	if req.Timeout <= 0 {
+		return nil, fmt.Errorf("positive Timeout is required")
+	}
+	if req.CancelOnSkillInvocation {
+		return nil, fmt.Errorf("codex engine does not support skill invocation telemetry required by trigger tests")
+	}
+
+	// A model named on the request overrides the engine-wide default.
+	modelID := e.defaultModelID
+	if req.ModelID != "" {
+		modelID = req.ModelID
+	}
+
+	sourceDir := req.SourceDir
+	if sourceDir == "" {
+		cwd, err := os.Getwd()
+		if err != nil {
+			return nil, fmt.Errorf("failed to get current directory: %w", err)
+		}
+		sourceDir = cwd
+	}
+
+	start := time.Now()
+
+	workspaceDir := req.WorkspaceDir
+	if workspaceDir == "" {
+		tmpDir, err := os.MkdirTemp("", "waza-codex-*")
+		if err != nil {
+			return nil, fmt.Errorf("failed to create codex workspace: %w", err)
+		}
+		workspaceDir = tmpDir
+		// Track before populating so Shutdown removes the directory even when
+		// resource setup fails.
+		e.trackWorkspace(workspaceDir)
+
+		if err := setupWorkspaceResources(workspaceDir, req.Resources); err != nil {
+			return nil, fmt.Errorf("failed to setup codex workspace resources: %w", err)
+		}
+	}
+
+	// Only impose the request timeout when the caller set no deadline.
+	if _, hasDeadline := ctx.Deadline(); !hasDeadline {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, req.Timeout)
+		defer cancel()
+	}
+
+	// Codex writes its last message to this file; it lives outside the
+	// workspace so it does not show up in the captured workspace files.
+	outputFile, err := os.CreateTemp("", "waza-codex-output-*.txt")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create codex output file: %w", err)
+	}
+	outputPath := outputFile.Name()
+	_ = outputFile.Close()
+	defer os.Remove(outputPath) //nolint:errcheck
+
+	args := e.buildArgs(req, modelID, workspaceDir, outputPath)
+
+	prompt := e.buildPrompt(sourceDir, req)
+	cmd := exec.CommandContext(ctx, e.binaryPath, args...)
+	cmd.Dir = workspaceDir
+	cmd.Env = os.Environ()
+	cmd.Stdin = strings.NewReader(prompt)
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	runErr := cmd.Run()
+	// Whatever was emitted is parsed even when the process failed.
+	telemetry := parseCodexJSONEvents(stdout.String())
+	finalOutput := readCodexOutput(outputPath, telemetry.FinalOutput())
+
+	errMsg := ""
+	success := true
+	if runErr != nil {
+		success = false
+		errMsg = strings.TrimSpace(stderr.String())
+		if errMsg == "" {
+			errMsg = runErr.Error()
+		} else {
+			errMsg = fmt.Sprintf("%s: %v", errMsg, runErr)
+		}
+	}
+
+	// Session ID preference: reported by Codex, then supplied by the caller,
+	// then a generated placeholder.
+	sessionID := telemetry.SessionID
+	if sessionID == "" {
+		sessionID = req.SessionID
+	}
+	if sessionID == "" {
+		sessionID = fmt.Sprintf("codex-session-%d", time.Now().UnixNano())
+	}
+
+	return &ExecutionResponse{
+		FinalOutput:    finalOutput,
+		Events:         telemetry.Events,
+		ModelID:        modelID,
+		DurationMs:     time.Since(start).Milliseconds(),
+		ToolCalls:      models.FilterToolCalls(telemetry.Events),
+		ErrorMsg:       errMsg,
+		Success:        success,
+		WorkspaceDir:   workspaceDir,
+		WorkspaceFiles: captureWorkspaceFiles(workspaceDir),
+		SessionID:      sessionID,
+		Usage:          telemetry.Usage,
+	}, nil
+}
+
+// buildArgs assembles the `codex exec` command line.
+//
+// A request with a session ID resumes that session (`exec resume`) and sets
+// the sandbox through a config override; a fresh run passes the workspace and
+// sandbox as flags. Both variants share the approval policy, git-check skip,
+// output file and optional model / reasoning-effort settings, and both read
+// the prompt from stdin ("-").
+func (e *CodexEngine) buildArgs(req *ExecutionRequest, modelID, workspaceDir, outputPath string) []string {
+	var head, tail []string
+	if req.SessionID != "" {
+		head = []string{"exec", "resume", "--json", "-c", `sandbox_mode="workspace-write"`}
+		tail = []string{req.SessionID, "-"}
+	} else {
+		head = []string{"exec", "--json", "--cd", workspaceDir, "--sandbox", "workspace-write", "--color", "never"}
+		tail = []string{"-"}
+	}
+
+	args := append(head,
+		"-c", `approval_policy="never"`,
+		"--skip-git-repo-check",
+		"--output-last-message", outputPath,
+	)
+	if modelID != "" {
+		args = append(args, "--model", modelID)
+	}
+	if effort := req.ModelReasoningEffort; effort != "" {
+		args = append(args, "-c", fmt.Sprintf("model_reasoning_effort=%q", effort))
+	}
+	return append(args, tail...)
+}
+
+// Shutdown removes Codex workspaces created by this engine.
+//
+// Every tracked workspace is attempted even when an earlier removal fails;
+// the failures are combined into the returned error. When ctx is cancelled
+// part-way through, the workspaces not yet processed are put back on the
+// tracked list so a later Shutdown can still remove them, and the context
+// error is returned (combined with any removal errors seen so far). With
+// keepWorkspace set, directories are left in place and their paths are
+// printed to stderr.
+func (e *CodexEngine) Shutdown(ctx context.Context) error {
+	e.workspacesMu.Lock()
+	workspaces := e.workspaces
+	e.workspaces = nil
+	e.workspacesMu.Unlock()
+
+	var combined error
+	for i, ws := range workspaces {
+		if err := ctx.Err(); err != nil {
+			// Hand the unprocessed tail back instead of forgetting it.
+			e.workspacesMu.Lock()
+			e.workspaces = append(e.workspaces, workspaces[i:]...)
+			e.workspacesMu.Unlock()
+			if combined == nil {
+				return err
+			}
+			return fmt.Errorf("%w; %w", combined, err)
+		}
+		if ws == "" {
+			continue
+		}
+		if e.keepWorkspace {
+			fmt.Fprintf(os.Stderr, "Workspace preserved: %s\n", ws)
+			continue
+		}
+		if err := os.RemoveAll(ws); err != nil {
+			removeErr := fmt.Errorf("failed to remove codex workspace %s: %w", ws, err)
+			if combined == nil {
+				combined = removeErr
+			} else {
+				combined = fmt.Errorf("%w; %w", combined, removeErr)
+			}
+		}
+	}
+	return combined
+}
+
+// SessionUsage returns nil because codex exec does not currently expose Waza's
+// Copilot-style session usage digest. The sessionID argument is ignored.
+func (e *CodexEngine) SessionUsage(sessionID string) *models.UsageStats {
+	return nil
+}
+
+// trackWorkspace records a workspace directory so Shutdown can clean it up.
+// The list is updated while holding workspacesMu.
+func (e *CodexEngine) trackWorkspace(path string) {
+	e.workspacesMu.Lock()
+	e.workspaces = append(e.workspaces, path)
+	e.workspacesMu.Unlock()
+}
+
+// buildPrompt renders the text sent to Codex on stdin.
+//
+// It consists of, in order: the skill system message (unless the request
+// disables skills or no message is produced), a <waza_task> block with the
+// task name, description and metadata when any of them is present, and
+// finally the request message itself. Metadata entries are written in sorted
+// key order so the same request always yields the same prompt.
+func (e *CodexEngine) buildPrompt(sourceDir string, req *ExecutionRequest) string {
+	var sb strings.Builder
+
+	if !req.NoSkills {
+		skillDirs := skillDirsForRequest(sourceDir, req)
+		if msg := buildSkillSystemMessage(skillDirs, req.SkillName); msg != "" {
+			sb.WriteString(msg)
+			sb.WriteString("\n")
+		}
+	}
+
+	if req.TaskName != "" || req.TaskDescription != "" || len(req.Context) > 0 {
+		sb.WriteString("<waza_task>\n")
+		if req.TaskName != "" {
+			fmt.Fprintf(&sb, "Name: %s\n", req.TaskName)
+		}
+		if req.TaskDescription != "" {
+			fmt.Fprintf(&sb, "Description: %s\n", req.TaskDescription)
+		}
+		if len(req.Context) > 0 {
+			sb.WriteString("Metadata:\n")
+			// Map iteration order is random; sort the keys for a stable prompt.
+			keys := make([]string, 0, len(req.Context))
+			for k := range req.Context {
+				keys = append(keys, k)
+			}
+			slices.Sort(keys)
+			for _, k := range keys {
+				fmt.Fprintf(&sb, "- %s: %v\n", k, req.Context[k])
+			}
+		}
+		sb.WriteString("</waza_task>\n\n")
+	}
+
+	sb.WriteString(req.Message)
+	return sb.String()
+}
+
+// readCodexOutput returns the contents of outputPath, or the stdout fallback
+// when the file cannot be read or is empty.
+func readCodexOutput(outputPath, stdout string) string {
+	contents, readErr := os.ReadFile(outputPath)
+	if readErr != nil || len(contents) == 0 {
+		return stdout
+	}
+	return string(contents)
+}
+
+// codexTelemetry is what parseCodexJSONEvents extracts from the `--json`
+// event stream of a single Codex run.
+type codexTelemetry struct {
+	SessionID string
+	Events    []copilot.SessionEvent
+	Usage     *models.UsageStats
+}
+
+// FinalOutput returns the content of the most recent assistant message that
+// has content, or an empty string when there is none.
+func (t codexTelemetry) FinalOutput() string {
+	for idx := len(t.Events); idx > 0; idx-- {
+		candidate := t.Events[idx-1]
+		if candidate.Type != copilot.AssistantMessage || candidate.Data.Content == nil {
+			continue
+		}
+		return *candidate.Data.Content
+	}
+	return ""
+}
+
+// Wire types for one line of `codex exec --json` output. Only the fields
+// this package reads are declared.
+type (
+	// codexJSONEvent is the top-level object of a single event line.
+	codexJSONEvent struct {
+		Type     string         `json:"type"`
+		ThreadID string         `json:"thread_id"`
+		Item     codexJSONItem  `json:"item"`
+		Usage    codexJSONUsage `json:"usage"`
+	}
+
+	// codexJSONItem is the payload of item.started / item.completed events.
+	codexJSONItem struct {
+		ID               string                `json:"id"`
+		Type             string                `json:"type"`
+		Text             string                `json:"text"`
+		Name             string                `json:"name"`
+		ToolName         string                `json:"tool_name"`
+		Command          string                `json:"command"`
+		AggregatedOutput string                `json:"aggregated_output"`
+		Output           string                `json:"output"`
+		Status           string                `json:"status"`
+		ExitCode         *int                  `json:"exit_code"`
+		Arguments        any                   `json:"arguments"`
+		Changes          []codexJSONFileChange `json:"changes"`
+		// Extra is excluded from JSON decoding by its "-" tag.
+		Extra map[string]any `json:"-"`
+	}
+
+	// codexJSONFileChange is one entry of a file_change item.
+	codexJSONFileChange struct {
+		Path string `json:"path"`
+		Kind string `json:"kind"`
+	}
+
+	// codexJSONUsage is the token accounting attached to turn.completed.
+	codexJSONUsage struct {
+		InputTokens           int `json:"input_tokens"`
+		CachedInputTokens     int `json:"cached_input_tokens"`
+		OutputTokens          int `json:"output_tokens"`
+		ReasoningOutputTokens int `json:"reasoning_output_tokens"`
+	}
+)
+
+// parseCodexJSONEvents converts the newline-delimited JSON printed by
+// `codex exec --json` into telemetry.
+//
+// Lines that are blank, do not start with "{", or fail to decode are skipped.
+// thread.started supplies the session ID, item.started / item.completed
+// become tool events (agent_message items with text become assistant
+// messages), and turn.completed supplies token usage. A later non-zero
+// turn.completed replaces the usage from an earlier one.
+func parseCodexJSONEvents(stdout string) codexTelemetry {
+	var result codexTelemetry
+
+	// keep appends a converted event only when the conversion succeeded.
+	keep := func(evt copilot.SessionEvent, ok bool) {
+		if ok {
+			result.Events = append(result.Events, evt)
+		}
+	}
+
+	for _, raw := range strings.Split(stdout, "\n") {
+		trimmed := strings.TrimSpace(raw)
+		// The prefix test also rejects empty lines.
+		if !strings.HasPrefix(trimmed, "{") {
+			continue
+		}
+
+		var decoded codexJSONEvent
+		if json.Unmarshal([]byte(trimmed), &decoded) != nil {
+			continue
+		}
+
+		switch decoded.Type {
+		case "thread.started":
+			result.SessionID = decoded.ThreadID
+		case "item.started":
+			keep(codexItemToSessionEvent(decoded.Item, false))
+		case "item.completed":
+			if decoded.Item.Type != "agent_message" {
+				keep(codexItemToSessionEvent(decoded.Item, true))
+				break
+			}
+			if msg := decoded.Item.Text; msg != "" {
+				result.Events = append(result.Events, copilot.SessionEvent{
+					Type: copilot.AssistantMessage,
+					Data: copilot.Data{Content: &msg},
+				})
+			}
+		case "turn.completed":
+			stats := &models.UsageStats{
+				Turns:           1,
+				InputTokens:     decoded.Usage.InputTokens,
+				OutputTokens:    decoded.Usage.OutputTokens,
+				CacheReadTokens: decoded.Usage.CachedInputTokens,
+			}
+			if !stats.IsZero() {
+				result.Usage = stats
+			}
+		}
+	}
+	return result
+}
+
+// codexItemToSessionEvent maps a Codex item to a tool start event
+// (completed == false) or a tool completion event (completed == true). The
+// boolean result is false when the item is not a tool-like item.
+//
+// The tool call ID is the item ID, or "<item type>-<tool name>" when the item
+// has none. A completed item counts as failed when its status is "failed" or
+// it reports a non-zero exit code.
+func codexItemToSessionEvent(item codexJSONItem, completed bool) (copilot.SessionEvent, bool) {
+	name, arguments, output, recognized := codexToolFields(item)
+	if !recognized {
+		return copilot.SessionEvent{}, false
+	}
+
+	callID := item.ID
+	if callID == "" {
+		callID = item.Type + "-" + name
+	}
+
+	data := copilot.Data{
+		ToolCallID: &callID,
+		ToolName:   &name,
+	}
+
+	if !completed {
+		data.Arguments = arguments
+		return copilot.SessionEvent{Type: copilot.ToolExecutionStart, Data: data}, true
+	}
+
+	succeeded := item.Status != "failed" && (item.ExitCode == nil || *item.ExitCode == 0)
+	data.Success = &succeeded
+	data.Result = &copilot.Result{Content: &output}
+	return copilot.SessionEvent{Type: copilot.ToolExecutionComplete, Data: data}, true
+}
+
+// codexToolFields derives (tool name, arguments, result text, ok) from a
+// Codex item.
+//
+//   - command_execution is reported as "bash" with the command as argument and
+//     the aggregated output as result.
+//   - file_change is reported as "edit"; only the first listed change supplies
+//     the path and kind, and the item status is the result.
+//   - any other type containing "tool" uses the first non-empty of name,
+//     tool_name and type as the tool name, and the first non-empty of output,
+//     aggregated_output and status as the result.
+//
+// Every other item type yields ok == false.
+func codexToolFields(item codexJSONItem) (string, any, string, bool) {
+	firstNonEmpty := func(candidates ...string) string {
+		for _, c := range candidates {
+			if c != "" {
+				return c
+			}
+		}
+		return ""
+	}
+
+	switch {
+	case item.Type == "command_execution":
+		return "bash", map[string]any{"command": item.Command}, item.AggregatedOutput, true
+	case item.Type == "file_change":
+		var first codexJSONFileChange
+		if len(item.Changes) > 0 {
+			first = item.Changes[0]
+		}
+		return "edit", map[string]any{"path": first.Path, "command": first.Kind}, item.Status, true
+	case strings.Contains(item.Type, "tool"):
+		toolName := firstNonEmpty(item.Name, item.ToolName, item.Type)
+		resultText := firstNonEmpty(item.Output, item.AggregatedOutput, item.Status)
+		return toolName, item.Arguments, resultText, true
+	default:
+		return "", nil, "", false
+	}
+}
+
+// skillDirsForRequest lists the directories to search for skills: cwd first,
+// then the request's skill paths in order.
+//
+// Each path is normalized with filepath.Clean before duplicates are dropped,
+// so spellings that name the same directory (for example "skills" and
+// "skills/") appear only once. Empty paths are skipped. The result is never
+// nil.
+func skillDirsForRequest(cwd string, req *ExecutionRequest) []string {
+	candidates := make([]string, 0, 1+len(req.SkillPaths))
+	candidates = append(candidates, cwd)
+	candidates = append(candidates, req.SkillPaths...)
+
+	dirs := make([]string, 0, len(candidates))
+	seen := make(map[string]struct{}, len(candidates))
+	for _, p := range candidates {
+		if p == "" {
+			continue
+		}
+		cleaned := filepath.Clean(p)
+		if _, dup := seen[cleaned]; dup {
+			continue
+		}
+		seen[cleaned] = struct{}{}
+		dirs = append(dirs, cleaned)
+	}
+	return dirs
+}
+
+// cleanSkillDirs returns the given paths normalized with filepath.Clean, in
+// their original order, with empty entries removed. The result is never nil.
+func cleanSkillDirs(paths []string) []string {
+	out := make([]string, 0, len(paths))
+	for i := range paths {
+		if p := paths[i]; p != "" {
+			out = append(out, filepath.Clean(p))
+		}
+	}
+	return out
+}
diff --git a/internal/execution/codex_test.go b/internal/execution/codex_test.go
new file mode 100644
index 000000000..52f862b5e
--- /dev/null
+++ b/internal/execution/codex_test.go
@@ -0,0 +1,195 @@
+package execution
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestCodexEngineExecuteUsesCLIWorkspaceAndSkillContext drives the fake CLI and
+// checks the workspace files, stdin prompt, CLI flags, and parsed event stream.
+func TestCodexEngineExecuteUsesCLIWorkspaceAndSkillContext(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+	ctx := context.Background()
+	fakeCodex := writeFakeCodex(t, 0)
+	sourceDir := t.TempDir()
+	skillsRoot := filepath.Join(sourceDir, "skills")
+	skillFile := filepath.Join(skillsRoot, "demo", "SKILL.md")
+	require.NoError(t, os.MkdirAll(filepath.Dir(skillFile), 0o755))
+	require.NoError(t, os.WriteFile(skillFile, []byte("---\nname: demo\n---\nAlways mention workspace facts."), 0o644))
+
+	engine := NewCodexEngine("test-model", WithCodexBinary(fakeCodex))
+	require.NoError(t, engine.Initialize(ctx))
+	defer func() { require.NoError(t, engine.Shutdown(ctx)) }()
+	resp, err := engine.Execute(ctx, &ExecutionRequest{
+		Message:              "Inspect the fixture.",
+		ModelReasoningEffort: "high",
+		Resources:            []ResourceFile{{Path: "input.txt", Content: []byte("fixture data")}},
+		SourceDir:            sourceDir,
+		SkillName:            "demo",
+		TaskName:             "Codex task",
+		TaskDescription:      "Verify fake execution.",
+		SkillPaths:           []string{skillsRoot},
+		Timeout:              10 * time.Second,
+	})
+	require.NoError(t, err)
+	require.True(t, resp.Success)
+	require.Equal(t, "final from fake codex", resp.FinalOutput)
+	require.Equal(t, "test-model", resp.ModelID)
+	require.Contains(t, resp.WorkspaceFiles, "created.txt")
+	require.Equal(t, []byte("fixture data"), resp.WorkspaceFiles["input.txt"])
+
+	// prompt.txt is written by the fake CLI from its stdin.
+	prompt := string(resp.WorkspaceFiles["prompt.txt"])
+	for _, want := range []string{"<skill_context>", "Always mention workspace facts.", "Name: Codex task"} {
+		require.Contains(t, prompt, want)
+	}
+
+	// args.txt is written by the fake CLI from its command-line arguments.
+	args := string(resp.WorkspaceFiles["args.txt"])
+	for _, want := range []string{"--model test-model", `approval_policy="never"`, `model_reasoning_effort="high"`, "--sandbox workspace-write"} {
+		require.Contains(t, args, want)
+	}
+	require.NotContains(t, args, "--ephemeral")
+	require.Len(t, resp.ToolCalls, 1)
+	require.Equal(t, "bash", resp.ToolCalls[0].Name)
+	require.Equal(t, "codex-test-session", resp.SessionID)
+	require.NotNil(t, resp.Usage)
+	require.Equal(t, 12, resp.Usage.InputTokens)
+}
+
+// TestCodexEngineExecuteReportsCLIError checks that a failing CLI run is reported
+// through the response fields rather than as a Go error.
+func TestCodexEngineExecuteReportsCLIError(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+	ctx := context.Background()
+
+	// A non-zero exit code makes the fake CLI print to stderr and exit with it.
+	fakeCodex := writeFakeCodex(t, 7)
+	engine := NewCodexEngine("", WithCodexBinary(fakeCodex))
+	require.NoError(t, engine.Initialize(ctx))
+	defer func() { require.NoError(t, engine.Shutdown(ctx)) }()
+
+	failing := &ExecutionRequest{Message: "fail", Timeout: 10 * time.Second}
+	resp, err := engine.Execute(ctx, failing)
+
+	require.NoError(t, err)
+	require.False(t, resp.Success)
+	require.Contains(t, resp.ErrorMsg, "fake codex failed")
+	require.Equal(t, "final from fake codex", resp.FinalOutput)
+}
+
+// TestCodexEngineExecuteResumesSessionForFollowUp checks that a follow-up request
+// carrying the first SessionID and WorkspaceDir invokes "exec resume" in place.
+func TestCodexEngineExecuteResumesSessionForFollowUp(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fake codex shell script is POSIX-only")
+	}
+	ctx := context.Background()
+
+	fakeCodex := writeFakeCodex(t, 0)
+	engine := NewCodexEngine("", WithCodexBinary(fakeCodex))
+	require.NoError(t, engine.Initialize(ctx))
+	defer func() { require.NoError(t, engine.Shutdown(ctx)) }()
+
+	opening := &ExecutionRequest{Message: "Remember apple.", Timeout: 10 * time.Second}
+	first, err := engine.Execute(ctx, opening)
+	require.NoError(t, err)
+	require.Equal(t, "codex-test-session", first.SessionID)
+
+	followUp := &ExecutionRequest{
+		Message:      "What did I ask you to remember?",
+		SessionID:    first.SessionID,
+		WorkspaceDir: first.WorkspaceDir,
+		Timeout:      10 * time.Second,
+	}
+	second, err := engine.Execute(ctx, followUp)
+	require.NoError(t, err)
+
+	args := string(second.WorkspaceFiles["args.txt"])
+	require.Contains(t, args, "exec resume")
+	require.Contains(t, args, "codex-test-session")
+	require.NotContains(t, args, "--ephemeral")
+	require.Equal(t, first.WorkspaceDir, second.WorkspaceDir)
+}
+
+// TestCodexEngineExecuteRejectsSkillTriggerTelemetry checks that Execute returns an
+// error when the request sets CancelOnSkillInvocation.
+func TestCodexEngineExecuteRejectsSkillTriggerTelemetry(t *testing.T) {
+	ctx := context.Background()
+	engine := NewCodexEngine("", WithCodexBinary(writeFakeCodex(t, 0)))
+	require.NoError(t, engine.Initialize(ctx))
+	defer func() { require.NoError(t, engine.Shutdown(ctx)) }()
+
+	_, err := engine.Execute(ctx, &ExecutionRequest{
+		Message:                 "trigger?",
+		Timeout:                 10 * time.Second,
+		CancelOnSkillInvocation: true,
+	})
+	require.ErrorContains(t, err, "does not support skill invocation telemetry")
+}
+
+// writeFakeCodex writes an executable shell script named "codex" into a fresh temp dir and returns its path.
+// The script saves stdin and its arguments to files, prints canned JSON events, and exits with exitCode.
+func writeFakeCodex(t *testing.T, exitCode int) string {
+	t.Helper()
+	binPath := filepath.Join(t.TempDir(), "codex")
+	shellScript := `#!/bin/sh
+set -u
+work=""
+out=""
+args=""
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --cd)
+      work="$2"
+      args="$args $1 $2"
+      shift 2
+      ;;
+    --output-last-message)
+      out="$2"
+      args="$args $1 $2"
+      shift 2
+      ;;
+    *)
+      args="$args $1"
+      shift
+      ;;
+  esac
+done
+if [ -n "$work" ]; then
+  cd "$work"
+fi
+cat > prompt.txt
+printf "%s" "$args" > args.txt
+printf "created by fake codex" > created.txt
+if [ -n "$out" ]; then
+  printf "final from fake codex" > "$out"
+else
+  printf "final from fake codex"
+fi
+cat <<'JSON'
+{"type":"thread.started","thread_id":"codex-test-session"}
+{"type":"item.started","item":{"id":"item_0","type":"command_execution","command":"/bin/sh -c pwd","aggregated_output":"","exit_code":null,"status":"in_progress"}}
+{"type":"item.completed","item":{"id":"item_0","type":"command_execution","command":"/bin/sh -c pwd","aggregated_output":"fake pwd\n","exit_code":0,"status":"completed"}}
+{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"final from fake codex"}}
+{"type":"turn.completed","usage":{"input_tokens":12,"cached_input_tokens":3,"output_tokens":4,"reasoning_output_tokens":1}}
+JSON
+if [ ` + strconv.Itoa(exitCode) + ` -ne 0 ]; then
+  printf "fake codex failed\n" >&2
+  exit ` + strconv.Itoa(exitCode) + `
+fi
+`
+	require.NoError(t, os.WriteFile(binPath, []byte(shellScript), 0o755))
+	return binPath
+}
diff --git a/internal/execution/engine.go b/internal/execution/engine.go
index acbd870e5..907e7d54a 100644
--- a/internal/execution/engine.go
+++ b/internal/execution/engine.go
@@ -36,10 +36,11 @@ type WorkspaceKeeper interface {
 
 // ExecutionRequest represents a test execution request
 type ExecutionRequest struct {
-	ModelID   string
-	Message   string
-	Context   map[string]any
-	Resources []ResourceFile
+	ModelID              string
+	ModelReasoningEffort string
+	Message              string
+	Context              map[string]any
+	Resources            []ResourceFile
 
 	SessionID    string
 	WorkspaceDir string // Reuse an existing workspace directory (for follow-up prompts)
diff --git a/internal/models/spec.go b/internal/models/spec.go
index 6f8c33545..de423bfb7 100644
--- a/internal/models/spec.go
+++ b/internal/models/spec.go
@@ -35,20 +35,21 @@ type SpecIdentity struct {
 
 // Config controls execution behavior
 type Config struct {
-	TrialsPerTask  int            `yaml:"trials_per_task" json:"runs_per_test"`
-	TimeoutSec     int            `yaml:"timeout_seconds" json:"timeout_sec"`
-	Concurrent     bool           `yaml:"parallel" json:"concurrent"`
-	Workers        int            `yaml:"workers,omitempty" json:"workers,omitempty"`
-	StopOnError    bool           `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
-	EngineType     string         `yaml:"executor" json:"engine_type"`
-	ModelID        string         `yaml:"model" json:"model_id"`
-	SkillPaths     []string       `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
-	DisabledSkills []string       `yaml:"disabled_skills,omitempty" json:"disabled_skills,omitempty"`
-	RequiredSkills []string       `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
-	ServerConfigs  map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
-	MaxAttempts    int            `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
-	GroupBy        string         `yaml:"group_by,omitempty" json:"group_by,omitempty"`
-	JudgeModel     string         `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
+	TrialsPerTask        int            `yaml:"trials_per_task" json:"runs_per_test"`
+	TimeoutSec           int            `yaml:"timeout_seconds" json:"timeout_sec"`
+	Concurrent           bool           `yaml:"parallel" json:"concurrent"`
+	Workers              int            `yaml:"workers,omitempty" json:"workers,omitempty"`
+	StopOnError          bool           `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
+	EngineType           string         `yaml:"executor" json:"engine_type"`
+	ModelID              string         `yaml:"model" json:"model_id"`
+	ModelReasoningEffort string         `yaml:"model_reasoning_effort,omitempty" json:"model_reasoning_effort,omitempty"`
+	SkillPaths           []string       `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
+	DisabledSkills       []string       `yaml:"disabled_skills,omitempty" json:"disabled_skills,omitempty"`
+	RequiredSkills       []string       `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
+	ServerConfigs        map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
+	MaxAttempts          int            `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
+	GroupBy              string         `yaml:"group_by,omitempty" json:"group_by,omitempty"`
+	JudgeModel           string         `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
 }
 
 // GraderConfig defines a validator/grader
diff --git a/internal/models/spec_test.go b/internal/models/spec_test.go
index db0a437ba..25ce0e0f6 100644
--- a/internal/models/spec_test.go
+++ b/internal/models/spec_test.go
@@ -364,6 +364,32 @@ config:
 	})
 }
 
+// TestEvalSpec_ModelReasoningEffort checks that model_reasoning_effort in an eval
+// spec's config section is loaded into Config.ModelReasoningEffort.
+func TestEvalSpec_ModelReasoningEffort(t *testing.T) {
+	const specYAML = `name: reasoning-test
+skill: test
+config:
+  trials_per_task: 1
+  timeout_seconds: 60
+  executor: codex
+  model: gpt-4o
+  model_reasoning_effort: high
+`
+	specPath := filepath.Join(t.TempDir(), "reasoning.yaml")
+	if err := os.WriteFile(specPath, []byte(specYAML), 0644); err != nil {
+		t.Fatalf("Failed to write spec file: %v", err)
+	}
+
+	loaded, err := LoadEvalSpec(specPath)
+	if err != nil {
+		t.Fatalf("Failed to load spec: %v", err)
+	}
+	if got := loaded.Config.ModelReasoningEffort; got != "high" {
+		t.Errorf("Expected model_reasoning_effort='high', got '%s'", got)
+	}
+}
+
 func TestConfig_AllSkillsDisabled(t *testing.T) {
 	tests := []struct {
 		name     string
diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go
index f9eaf731a..6d3ecb159 100644
--- a/internal/orchestration/runner.go
+++ b/internal/orchestration/runner.go
@@ -1158,16 +1158,18 @@ func (r *EvalRunner) buildExecutionRequest(tc *models.TestCase) *execution.Execu
 	noSkills := spec.Config.AllSkillsDisabled()
 
 	return &execution.ExecutionRequest{
-		Message:         tc.Stimulus.Message,
-		Context:         tc.Stimulus.Metadata,
-		Resources:       resources,
-		SkillName:       spec.SkillName,
-		TaskName:        tc.DisplayName,
-		TaskDescription: tc.Summary,
-		SkillPaths:      resolvedSkillPaths,
-		NoSkills:        noSkills,
-		Timeout:         time.Duration(timeout) * time.Second,
-		MCPServers:      convertMCPServers(spec.Config.ServerConfigs),
+		ModelID:              spec.Config.ModelID,
+		ModelReasoningEffort: spec.Config.ModelReasoningEffort,
+		Message:              tc.Stimulus.Message,
+		Context:              tc.Stimulus.Metadata,
+		Resources:            resources,
+		SkillName:            spec.SkillName,
+		TaskName:             tc.DisplayName,
+		TaskDescription:      tc.Summary,
+		SkillPaths:           resolvedSkillPaths,
+		NoSkills:             noSkills,
+		Timeout:              time.Duration(timeout) * time.Second,
+		MCPServers:           convertMCPServers(spec.Config.ServerConfigs),
 	}
 }
 
diff --git a/internal/projectconfig/config.go b/internal/projectconfig/config.go
index cac235e6e..7ec5fb579 100644
--- a/internal/projectconfig/config.go
+++ b/internal/projectconfig/config.go
@@ -56,14 +56,15 @@ type PathsConfig struct {
 
 // DefaultsConfig holds default execution parameters.
 type DefaultsConfig struct {
-	Engine     string `yaml:"engine,omitempty"`
-	Model      string `yaml:"model,omitempty"`
-	JudgeModel string `yaml:"judgeModel,omitempty"`
-	Timeout    int    `yaml:"timeout,omitempty"`
-	Parallel   *bool  `yaml:"parallel,omitempty"`
-	Workers    int    `yaml:"workers,omitempty"`
-	Verbose    *bool  `yaml:"verbose,omitempty"`
-	SessionLog *bool  `yaml:"sessionLog,omitempty"`
+	Engine               string `yaml:"engine,omitempty"`
+	Model                string `yaml:"model,omitempty"`
+	ModelReasoningEffort string `yaml:"model_reasoning_effort,omitempty"`
+	JudgeModel           string `yaml:"judgeModel,omitempty"`
+	Timeout              int    `yaml:"timeout,omitempty"`
+	Parallel             *bool  `yaml:"parallel,omitempty"`
+	Workers              int    `yaml:"workers,omitempty"`
+	Verbose              *bool  `yaml:"verbose,omitempty"`
+	SessionLog           *bool  `yaml:"sessionLog,omitempty"`
 }
 
 // CacheConfig holds cache settings.
@@ -146,14 +147,15 @@ func New() *ProjectConfig {
 			Results: DefaultResultsDir,
 		},
 		Defaults: DefaultsConfig{
-			Engine:     DefaultEngine,
-			Model:      DefaultModel,
-			JudgeModel: "",
-			Timeout:    DefaultTimeout,
-			Parallel:   boolPtr(false),
-			Workers:    DefaultWorkers,
-			Verbose:    boolPtr(false),
-			SessionLog: boolPtr(false),
+			Engine:               DefaultEngine,
+			Model:                DefaultModel,
+			ModelReasoningEffort: "",
+			JudgeModel:           "",
+			Timeout:              DefaultTimeout,
+			Parallel:             boolPtr(false),
+			Workers:              DefaultWorkers,
+			Verbose:              boolPtr(false),
+			SessionLog:           boolPtr(false),
 		},
 		Cache: CacheConfig{
 			Enabled: boolPtr(false),
@@ -206,9 +208,17 @@ func Load(startDir string) (*ProjectConfig, error) {
 	if err := decoder.Decode(&fileCfg); err != nil {
 		return nil, fmt.Errorf("parsing .waza.yaml: %w", err)
 	}
+	defaultsModelSet := hasDefaultsField(data, "model")
 
 	// Merge file values onto defaults.
 	mergeConfig(cfg, &fileCfg)
+	if defaultsModelSet {
+		// An explicit empty model is meaningful for engines such as codex, where
+		// the underlying tool can read its default model from its own config.
+		cfg.Defaults.Model = fileCfg.Defaults.Model
+	} else if fileCfg.Defaults.Engine == "codex" {
+		cfg.Defaults.Model = ""
+	}
 
 	return cfg, nil
 }
@@ -262,6 +272,9 @@ func mergeConfig(dst, src *ProjectConfig) {
 	if src.Defaults.Model != "" {
 		dst.Defaults.Model = src.Defaults.Model
 	}
+	if src.Defaults.ModelReasoningEffort != "" {
+		dst.Defaults.ModelReasoningEffort = src.Defaults.ModelReasoningEffort
+	}
 	if src.Defaults.JudgeModel != "" {
 		dst.Defaults.JudgeModel = src.Defaults.JudgeModel
 	}
@@ -340,3 +353,26 @@ func mergeConfig(dst, src *ProjectConfig) {
 func boolPtr(b bool) *bool {
 	return &b
 }
+
+// hasDefaultsField reports whether the YAML document in data has a top-level
+// "defaults" mapping containing the key field; unparsable input yields false.
+func hasDefaultsField(data []byte, field string) bool {
+	var doc yaml.Node
+	if yaml.Unmarshal(data, &doc) != nil || len(doc.Content) == 0 || doc.Content[0].Kind != yaml.MappingNode {
+		return false
+	}
+	top := doc.Content[0]
+	// Mapping children are walked pairwise: key at k-1, value at k.
+	for k := 1; k < len(top.Content); k += 2 {
+		key, section := top.Content[k-1], top.Content[k]
+		if key.Value != "defaults" || section.Kind != yaml.MappingNode {
+			continue
+		}
+		for f := 1; f < len(section.Content); f += 2 {
+			if section.Content[f-1].Value == field {
+				return true
+			}
+		}
+	}
+	return false
+}
diff --git a/internal/projectconfig/config_test.go b/internal/projectconfig/config_test.go
index 47b497990..04b19d4bc 100644
--- a/internal/projectconfig/config_test.go
+++ b/internal/projectconfig/config_test.go
@@ -17,6 +17,7 @@ func TestNew_ReturnsAllDefaults(t *testing.T) {
 	// Defaults
 	assertEqual(t, "Defaults.Engine", "copilot-sdk", cfg.Defaults.Engine)
 	assertEqual(t, "Defaults.Model", "claude-sonnet-4.6", cfg.Defaults.Model)
+	assertEqual(t, "Defaults.ModelReasoningEffort", "", cfg.Defaults.ModelReasoningEffort)
 	assertEqual(t, "Defaults.JudgeModel", "", cfg.Defaults.JudgeModel)
 	assertEqualInt(t, "Defaults.Timeout", 300, cfg.Defaults.Timeout)
 	assertBoolPtr(t, "Defaults.Parallel", false, cfg.Defaults.Parallel)
@@ -58,6 +59,7 @@ paths:
 defaults:
   engine: mock
   model: gpt-4o
+  model_reasoning_effort: high
   judgeModel: claude-sonnet-4.6
   timeout: 600
   parallel: true
@@ -96,6 +98,7 @@ graders:
 	assertEqual(t, "Paths.Results", "custom-results/", cfg.Paths.Results)
 	assertEqual(t, "Defaults.Engine", "mock", cfg.Defaults.Engine)
 	assertEqual(t, "Defaults.Model", "gpt-4o", cfg.Defaults.Model)
+	assertEqual(t, "Defaults.ModelReasoningEffort", "high", cfg.Defaults.ModelReasoningEffort)
 	assertEqual(t, "Defaults.JudgeModel", "claude-sonnet-4.6", cfg.Defaults.JudgeModel)
 	assertEqualInt(t, "Defaults.Timeout", 600, cfg.Defaults.Timeout)
 	assertBoolPtr(t, "Defaults.Parallel", true, cfg.Defaults.Parallel)
@@ -148,6 +151,39 @@ defaults:
 	assertEqualInt(t, "Graders.ProgramTimeout", 30, cfg.Graders.ProgramTimeout)
 }
 
+// TestLoad_CodexDefaultsAllowConfigModel checks that selecting the codex engine
+// without a model leaves Defaults.Model empty.
+func TestLoad_CodexDefaultsAllowConfigModel(t *testing.T) {
+	projectDir := t.TempDir()
+	writeFile(t, projectDir, ".waza.yaml", `
+defaults:
+  engine: codex
+`)
+	loaded, loadErr := Load(projectDir)
+	if loadErr != nil {
+		t.Fatalf("Load() error: %v", loadErr)
+	}
+	assertEqual(t, "Defaults.Engine", "codex", loaded.Defaults.Engine)
+	assertEqual(t, "Defaults.Model", "", loaded.Defaults.Model)
+}
+
+// TestLoad_ExplicitEmptyModelOverridesDefault checks that an explicit empty model
+// in .waza.yaml leaves Defaults.Model empty.
+func TestLoad_ExplicitEmptyModelOverridesDefault(t *testing.T) {
+	projectDir := t.TempDir()
+	writeFile(t, projectDir, ".waza.yaml", `
+defaults:
+  engine: codex
+  model: ""
+`)
+	loaded, loadErr := Load(projectDir)
+	if loadErr != nil {
+		t.Fatalf("Load() error: %v", loadErr)
+	}
+	assertEqual(t, "Defaults.Engine", "codex", loaded.Defaults.Engine)
+	assertEqual(t, "Defaults.Model", "", loaded.Defaults.Model)
+}
+
 func TestLoad_MissingFile_ReturnsDefaults(t *testing.T) {
 	dir := t.TempDir()
 
diff --git a/internal/projectconfig/schema_parity_test.go b/internal/projectconfig/schema_parity_test.go
index 7487df360..69961ec5b 100644
--- a/internal/projectconfig/schema_parity_test.go
+++ b/internal/projectconfig/schema_parity_test.go
@@ -65,6 +65,7 @@ func TestSchemaDefaultsMatchGoDefaults(t *testing.T) {
 	// --- defaults ---
 	assertStringDefault(t, getDefault("defaults", "engine"), cfg.Defaults.Engine, "defaults.engine")
 	assertStringDefault(t, getDefault("defaults", "model"), cfg.Defaults.Model, "defaults.model")
+	assertStringDefault(t, getDefault("defaults", "model_reasoning_effort"), cfg.Defaults.ModelReasoningEffort, "defaults.model_reasoning_effort")
 	assertIntDefault(t, getDefault("defaults", "timeout"), cfg.Defaults.Timeout, "defaults.timeout")
 	assertBoolDefault(t, getDefault("defaults", "parallel"), *cfg.Defaults.Parallel, "defaults.parallel")
 	assertIntDefault(t, getDefault("defaults", "workers"), cfg.Defaults.Workers, "defaults.workers")
diff --git a/internal/scaffold/scaffold.go b/internal/scaffold/scaffold.go
index 838c4e23a..ca2d0f5f5 100644
--- a/internal/scaffold/scaffold.go
+++ b/internal/scaffold/scaffold.go
@@ -44,7 +44,8 @@ func TitleCase(s string) string {
 }
 
 // ReadProjectDefaults reads engine and model from .waza.yaml if it exists.
-// Falls back to copilot-sdk and claude-sonnet-4.6.
+// Falls back to copilot-sdk and claude-sonnet-4.6. Codex projects may return
+// an empty model so the Codex CLI can use ~/.codex/config.toml.
 func ReadProjectDefaults() (engine, model string) {
 	dir, err := os.Getwd()
 	if err != nil {
@@ -59,6 +60,11 @@ func ReadProjectDefaults() (engine, model string) {
 
 // EvalYAML returns a default eval.yaml template for the given skill name.
 func EvalYAML(name, engine, model string) string {
+	modelLine := ""
+	if model != "" {
+		modelLine = fmt.Sprintf("  model: %s\n", model)
+	}
+
 	return fmt.Sprintf(`name: %s-eval
 description: Evaluation suite for %s.
 skill: %s
@@ -68,7 +74,7 @@ config:
   timeout_seconds: 300
   parallel: false
   executor: %s
-  model: %s
+%s
 metrics:
   - name: task_completion
     weight: 1.0
@@ -87,7 +93,7 @@ graders:
         - "(?i)(explain|describe|analyze|implement)"
 tasks:
   - "tasks/*.yaml"
-`, name, name, name, engine, model)
+`, name, name, name, engine, modelLine)
 }
 
 // TaskFiles returns a map of task filename to content.
diff --git a/internal/scaffold/scaffold_test.go b/internal/scaffold/scaffold_test.go
index 651651c81..4b89eaea0 100644
--- a/internal/scaffold/scaffold_test.go
+++ b/internal/scaffold/scaffold_test.go
@@ -81,6 +81,13 @@ func TestEvalYAML_CustomEngine(t *testing.T) {
 	assert.Contains(t, content, "model: gpt-4o")
 }
 
+// TestEvalYAML_OmitsEmptyModel checks that an empty model emits no model line.
+func TestEvalYAML_OmitsEmptyModel(t *testing.T) {
+	rendered := EvalYAML("my-skill", "codex", "")
+	assert.Contains(t, rendered, "executor: codex")
+	assert.NotContains(t, rendered, "model:")
+}
+
 func TestTaskFiles(t *testing.T) {
 	tasks := TaskFiles("my-skill")
 
diff --git a/internal/validation/schema_test.go b/internal/validation/schema_test.go
index 874b05f7e..f22cd64e9 100644
--- a/internal/validation/schema_test.go
+++ b/internal/validation/schema_test.go
@@ -56,6 +56,26 @@ func TestValidateEvalBytes_Valid(t *testing.T) {
 	require.Empty(t, errs, "valid eval should have no errors")
 }
 
+// TestValidateEvalBytes_CodexModelOptional checks that a codex eval with no model passes validation.
+func TestValidateEvalBytes_CodexModelOptional(t *testing.T) {
+	evalDoc := `name: test-eval
+description: Test evaluation
+skill: test-skill
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 60
+  executor: codex
+metrics:
+  - name: accuracy
+    weight: 1.0
+    threshold: 0.8
+tasks:
+  - "tasks/*.yaml"
+`
+	require.Empty(t, ValidateEvalBytes([]byte(evalDoc)), "codex eval should allow model to come from ~/.codex/config.toml")
+}
+
 func TestValidateEvalBytes_Invalid(t *testing.T) {
 	errs := ValidateEvalBytes([]byte(invalidEvalYAML))
 	require.NotEmpty(t, errs, "invalid eval should have errors")
diff --git a/schemas/config.schema.json b/schemas/config.schema.json
index a413eee22..45cb270e1 100644
--- a/schemas/config.schema.json
+++ b/schemas/config.schema.json
@@ -31,19 +31,25 @@
       "type": "object",
       "description": "Default values applied to evaluations. Used as fallbacks by 'waza run' and as defaults for 'waza new'.",
       "properties": {
-        "engine": {
-          "type": "string",
-          "description": "Execution engine for evaluations.",
-          "enum": ["copilot-sdk", "mock"],
-          "default": "copilot-sdk"
-        },
-        "model": {
-          "type": "string",
-          "description": "Default model identifier for evaluations.",
-          "examples": ["claude-sonnet-4.6", "gpt-4o", "claude-sonnet-4"],
-          "default": "claude-sonnet-4.6"
-        },
-        "judgeModel": {
+        "engine": {
+          "type": "string",
+          "description": "Execution engine for evaluations. Use 'codex' to run through the local Codex CLI and its ~/.codex config/auth.",
+          "enum": ["copilot-sdk", "codex", "mock"],
+          "default": "copilot-sdk"
+        },
+        "model": {
+          "type": "string",
+          "description": "Default model identifier for evaluations. Optional for the 'codex' engine when ~/.codex/config.toml provides the model.",
+          "examples": ["claude-sonnet-4.6", "gpt-4o", "claude-sonnet-4"],
+          "default": "claude-sonnet-4.6"
+        },
+        "model_reasoning_effort": {
+          "type": "string",
+          "description": "Codex model reasoning effort passed as a Codex config override. Must be one of: none, minimal, low, medium, high, xhigh.",
+          "enum": ["none", "minimal", "low", "medium", "high", "xhigh"],
+          "default": ""
+        },
+        "judgeModel": {
           "type": "string",
           "description": "Model used for LLM-as-judge grading. If omitted, uses the same model as the evaluation.",
           "examples": ["claude-sonnet-4.6", "gpt-4o"]
diff --git a/schemas/eval.schema.json b/schemas/eval.schema.json
index b5b355242..98b675743 100644
--- a/schemas/eval.schema.json
+++ b/schemas/eval.schema.json
@@ -95,8 +95,7 @@
       "required": [
         "trials_per_task",
         "timeout_seconds",
-        "executor",
-        "model"
+        "executor"
       ],
       "additionalProperties": false,
       "description": "Execution configuration for the evaluation.",
@@ -130,20 +129,33 @@
           "type": "string",
           "enum": [
             "copilot-sdk",
+            "codex",
             "mock"
           ],
-          "description": "Execution engine to use. 'copilot-sdk' for real evaluations, 'mock' for testing."
+          "description": "Execution engine to use. 'copilot-sdk' for GitHub Copilot SDK evaluations, 'codex' for local Codex CLI evaluations, 'mock' for testing."
         },
         "model": {
           "type": "string",
           "minLength": 1,
-          "description": "Default model identifier for evaluations.",
+          "description": "Default model identifier for evaluations. Optional for the 'codex' executor when ~/.codex/config.toml provides the model.",
           "examples": [
             "gpt-4o",
             "claude-sonnet-4-20250514",
             "gpt-4o-mini"
           ]
         },
+        "model_reasoning_effort": {
+          "type": "string",
+          "enum": [
+            "none",
+            "minimal",
+            "low",
+            "medium",
+            "high",
+            "xhigh"
+          ],
+          "description": "Codex model reasoning effort passed as a Codex config override."
+        },
         "max_attempts": {
           "type": "integer",
           "minimum": 1,
diff --git a/site/src/content/docs/guides/eval-yaml.mdx b/site/src/content/docs/guides/eval-yaml.mdx
index e9b1bfa5d..3aa6b1464 100644
--- a/site/src/content/docs/guides/eval-yaml.mdx
+++ b/site/src/content/docs/guides/eval-yaml.mdx
@@ -97,8 +97,9 @@ config:
   parallel: false # Run tasks sequentially (true = concurrent)
   workers: 4 # Parallel workers if parallel: true
   model: claude-sonnet-4.6 # Default model (override with --model)
+  model_reasoning_effort: high # Codex-only reasoning effort override
   judge_model: gpt-4o # Model for LLM-as-judge graders (optional)
-  executor: mock # mock (local) or copilot-sdk (real API)
+  executor: mock # mock (local), copilot-sdk (GitHub Copilot), or codex (local Codex CLI)
 ```
 
 | Field               | Type      | Default           | Description                                                 |
@@ -107,9 +108,10 @@ config:
 | `timeout_seconds`   | int       | 300               | Task timeout in seconds                                     |
 | `parallel`          | bool      | false             | Run tasks concurrently                                      |
 | `workers`           | int       | 4                 | Number of parallel workers                                  |
-| `model`             | string    | _required_        | Default model for tasks (override with `--model` flag)      |
+| `model`             | string    | _required_        | Default model for tasks (override with `--model` flag). Optional for `codex` when `~/.codex/config.toml` provides the model. |
+| `model_reasoning_effort` | string | —                 | Codex reasoning effort passed via `-c model_reasoning_effort=...`; commonly `none`, `minimal`, `low`, `medium`, `high`, or `xhigh` |
 | `judge_model`       | string    | (same as `model`) | Model for `prompt`-type graders (LLM-as-judge)              |
-| `executor`          | string    | `copilot-sdk`     | Executor: `mock` (local, echoes task metadata and file content) or `copilot-sdk` (real API)  |
+| `executor`          | string    | `copilot-sdk`     | Executor: `mock` (local, echoes task metadata and file content), `copilot-sdk` (GitHub Copilot SDK), or `codex` (local Codex CLI using `~/.codex` config/auth). |
 | `max_attempts`      | int       | 0                 | Maximum retry attempts per task on failure (0 = no retries) |
 | `group_by`          | string    | —                 | Group results by a field (e.g., `tags`, `task_id`)          |
 | `fail_fast`         | bool      | false             | Stop the entire run on first task failure                   |
diff --git a/site/src/content/docs/reference/schema.mdx b/site/src/content/docs/reference/schema.mdx
index 612d0312e..586fa6550 100644
--- a/site/src/content/docs/reference/schema.mdx
+++ b/site/src/content/docs/reference/schema.mdx
@@ -145,18 +145,31 @@ config:
 **Type:** string  
 **Default:** (empty)
 
-Default LLM model. Override with `--model` flag.
+Default LLM model. Override with `--model` flag. Optional for `codex` when `~/.codex/config.toml` provides the model.
 
 ```yaml
 config:
   model: claude-sonnet-4.6
 ```
 
+### model_reasoning_effort
+
+**Type:** string  
+**Default:** (empty)
+
+Codex reasoning effort passed through as `-c model_reasoning_effort=...`. Common values are `none`, `minimal`, `low`, `medium`, `high`, and `xhigh`.
+
+```yaml
+config:
+  executor: codex
+  model_reasoning_effort: high
+```
+
 ### executor
 
 **Type:** string  
 **Default:** mock  
-**Options:** `mock`, `copilot-sdk`
+**Options:** `mock`, `copilot-sdk`, `codex`
 
 Execution engine:
 
@@ -609,6 +622,8 @@ paths:
 
 # Model defaults
 defaults:
+  engine: codex
+  model_reasoning_effort: high
-  model: claude-sonnet-4.6
+  model: gpt-4o
   timeout: 300
   workers: 4
diff --git a/site/src/content/docs/reference/waza-yaml.mdx b/site/src/content/docs/reference/waza-yaml.mdx
index 57f408313..896e89aaa 100644
--- a/site/src/content/docs/reference/waza-yaml.mdx
+++ b/site/src/content/docs/reference/waza-yaml.mdx
@@ -15,7 +15,9 @@ paths:
   results: results/
 
 defaults:
-  model: claude-sonnet-4.6
+  engine: codex
+  model: gpt-4o
+  model_reasoning_effort: high
   timeout: 300
   workers: 4
 ```
@@ -38,8 +40,9 @@ Default execution parameters applied to all commands unless overridden by CLI fl
 
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
-| `engine` | string | `copilot-sdk` | Execution engine |
-| `model` | string | `claude-sonnet-4.6` | Default model for execution |
+| `engine` | string | `copilot-sdk` | Execution engine: `copilot-sdk`, `codex`, or `mock` |
+| `model` | string | `claude-sonnet-4.6` | Default model for execution. Optional for `codex` when `~/.codex/config.toml` provides it |
+| `model_reasoning_effort` | string | | Codex reasoning effort passed via `-c model_reasoning_effort=...`; commonly `none`, `minimal`, `low`, `medium`, `high`, or `xhigh` |
 | `judgeModel` | string | | Model for LLM-as-judge graders |
 | `timeout` | int | `300` | Task timeout in seconds |
 | `parallel` | bool | `false` | Enable parallel execution |